Python – HTML do PDF

Autor: chmajster 24.08.2023

Wymagania

pip3 install pdfkit

Kod

import pdfkit

def save_webpage_as_pdf(url, output_pdf_path, wkhtmltopdf_path):
    """
    Save a web page as a PDF file using a given wkhtmltopdf executable.

    :param url: Address of the page to save.
    :param output_pdf_path: Path of the PDF file the page is written to.
    :param wkhtmltopdf_path: Path to the wkhtmltopdf executable.
    """
    # Build the pdfkit configuration inline from the supplied executable path.
    pdfkit.from_url(
        url,
        output_pdf_path,
        configuration=pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path),
    )

    # User-facing confirmation (kept in Polish, as emitted by the script).
    confirmation = f'Strona internetowa została zapisana jako {output_pdf_path}'
    print(confirmation)

# Example usage: these statements run at module level, so the conversion
# starts as soon as the snippet is executed.
webpage_url = 'https://www.example.com'
pdf_output_path = 'output.pdf'
# Placeholder location; the raw string keeps the Windows backslashes literal.
wkhtmltopdf_executable_path = r'C:\path\to\wkhtmltopdf.exe'

save_webpage_as_pdf(webpage_url, pdf_output_path, wkhtmltopdf_executable_path)

KOD LINUX

import pdfkit

def save_webpage_as_pdf(url, output_pdf_path):
    """
    Save a web page as a PDF file with pdfkit's default configuration.

    :param url: Address of the page to save.
    :param output_pdf_path: Path of the PDF file the page is written to.
    """
    # No explicit configuration is passed, unlike the variant that takes a
    # wkhtmltopdf path.
    pdfkit.from_url(url, output_pdf_path)

    # User-facing confirmation (kept in Polish, as emitted by the script).
    confirmation = f'Strona internetowa została zapisana jako {output_pdf_path}'
    print(confirmation)

# Example usage: these statements run at module level, so the conversion
# starts as soon as the snippet is executed.
webpage_url = 'https://www.example.com'
pdf_output_path = 'output.pdf'

save_webpage_as_pdf(webpage_url, pdf_output_path)

Autoryzacja: login i hasło

import pdfkit
import requests
from requests.auth import HTTPBasicAuth  # Importujemy klasę do autoryzacji

def save_webpage_as_pdf(url, output_pdf_path, username, password, timeout=30):
    """
    Save a web page as a PDF file, authenticating with a user name and password.

    The page is downloaded with ``requests`` using HTTP Basic authentication
    and the returned HTML text is passed to ``pdfkit.from_string``. When the
    response status is not 200, an error message is printed and no PDF is
    written.

    :param url: Address of the page to save.
    :param output_pdf_path: Path of the PDF file the page is written to.
    :param username: User name used for authentication.
    :param password: Password used for authentication.
    :param timeout: Timeout in seconds handed to ``requests`` so that a
        stalled server cannot block the call indefinitely.
    """
    # The context manager closes the session even when the request raises;
    # the original code never closed it.
    with requests.Session() as session:
        session.auth = HTTPBasicAuth(username, password)
        response = session.get(url, timeout=timeout)

    # Guard clause: report the failure and stop before touching pdfkit.
    if response.status_code != 200:
        print('Błąd podczas pobierania zawartości strony.')
        return

    pdfkit.from_string(response.text, output_pdf_path)
    print(f'Strona internetowa została zapisana jako {output_pdf_path}')

# Example usage: these statements run at module level, so the download and
# conversion start as soon as the snippet is executed.
webpage_url = 'https://www.example.com'
pdf_output_path = 'output.pdf'
# Placeholder credentials; replace them with real values before running.
username = 'your_username'
password = 'your_password'

save_webpage_as_pdf(webpage_url, pdf_output_path, username, password)

LUB

import requests
from requests.auth import HTTPBasicAuth
from weasyprint import HTML

def save_webpage_as_pdf(url, output_pdf_path, username, password, timeout=30):
    """
    Save a webpage as a PDF, authenticating with a username and password.

    The page is downloaded with ``requests`` using HTTP Basic authentication,
    rendered with WeasyPrint and written to ``output_pdf_path``. When the
    response status is not 200, an error message is printed and no file is
    written.

    :param url: URL of the webpage to save; also passed to WeasyPrint as
        ``base_url``.
    :param output_pdf_path: Path to the output PDF file.
    :param username: Username for authentication.
    :param password: Password for authentication.
    :param timeout: Timeout in seconds handed to ``requests`` so that a
        stalled server cannot block the call indefinitely.
    """
    # The context manager closes the session even when the request raises;
    # the original code never closed it.
    with requests.Session() as session:
        session.auth = HTTPBasicAuth(username, password)
        response = session.get(url, timeout=timeout)

    # Guard clause: report the failure and stop before rendering anything.
    if response.status_code != 200:
        print('Error while fetching webpage content.')
        return

    # Render the downloaded bytes to PDF with WeasyPrint.
    pdf = HTML(string=response.content, base_url=url).write_pdf()

    # write_pdf() was called without a target, so the result is written here.
    with open(output_pdf_path, 'wb') as f:
        f.write(pdf)

    print(f'Webpage saved as {output_pdf_path}')

# Example usage: these statements run at module level, so the download and
# conversion start as soon as the snippet is executed.
webpage_url = 'https://www.example.com'
pdf_output_path = 'output.pdf'
# Placeholder credentials; replace them with real values before running.
username = 'your_username'
password = 'your_password'

save_webpage_as_pdf(webpage_url, pdf_output_path, username, password)

Dodaj, aby nazwa pliku pochodziła z tytułu strony

import requests
from requests.auth import HTTPBasicAuth
from weasyprint import HTML

def _filename_from_content_disposition(header_value):
    """
    Return the file name carried by a Content-Disposition header value.

    :param header_value: Raw header value, or ``None`` when the header is
        absent.
    :return: The bare file name without any directory part, or ``None`` when
        the header is missing or has no usable ``filename`` parameter.
    """
    import os
    from email.message import Message

    if not header_value:
        return None

    # Use the stdlib header parser rather than splitting on 'filename=' by
    # hand: it copes with a missing parameter, quoting, parameters that
    # follow the file name, and the RFC 2231 ``filename*=`` form.
    parsed = Message()
    parsed['Content-Disposition'] = header_value
    filename = parsed.get_filename()
    if not filename:
        return None

    # The header is controlled by the server: drop directory components so
    # the PDF is always written relative to the current directory.
    return os.path.basename(filename.replace('\\', '/')) or None


def save_webpage_as_pdf(url, username, password, timeout=30):
    """
    Save a webpage as a PDF, authenticating with a username and password.

    The output file is named after the ``filename`` parameter of the
    response's Content-Disposition header (not the HTML ``<title>``), with
    ``.pdf`` appended; ``output.pdf`` is used when the header is missing or
    carries no file name. When the response status is not 200, an error
    message is printed and no file is written.

    :param url: URL of the webpage to save; also passed to WeasyPrint as
        ``base_url``.
    :param username: Username for authentication.
    :param password: Password for authentication.
    :param timeout: Timeout in seconds handed to ``requests`` so that a
        stalled server cannot block the call indefinitely.
    """
    # The context manager closes the session even when the request raises;
    # the original code never closed it.
    with requests.Session() as session:
        session.auth = HTTPBasicAuth(username, password)
        response = session.get(url, timeout=timeout)

    # Guard clause: report the failure and stop before rendering anything.
    if response.status_code != 200:
        print('Error while fetching webpage content.')
        return

    # Render the downloaded bytes to PDF with WeasyPrint.
    pdf = HTML(string=response.content, base_url=url).write_pdf()

    # Previously ``split('filename=')[1]`` raised IndexError for headers such
    # as "inline" that have no filename parameter.
    header_value = response.headers.get('content-disposition')
    title = _filename_from_content_disposition(header_value) or 'output'
    pdf_output_path = f'{title}.pdf'

    # Save the PDF to the output path.
    with open(pdf_output_path, 'wb') as f:
        f.write(pdf)

    print(f'Webpage saved as {pdf_output_path}')

# Example usage: these statements run at module level, so the download and
# conversion start as soon as the snippet is executed.
webpage_url = 'https://www.example.com'
# Placeholder credentials; replace them with real values before running.
username = 'your_username'
password = 'your_password'

save_webpage_as_pdf(webpage_url, username, password)

import requests
from requests.auth import HTTPBasicAuth
from weasyprint import HTML
from bs4 import BeautifulSoup

def get_title(html_content):
    """
    Extract the text of the first ``<title>`` element from HTML content.

    :param html_content: HTML content of the webpage.
    :return: The title with surrounding whitespace removed, or ``'untitled'``
        when there is no ``<title>`` element or it contains no text.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        return 'untitled'

    # ``Tag.string`` is None for an empty <title> or one with several child
    # nodes, so the former ``title_tag.string.strip()`` raised AttributeError.
    # ``get_text()`` always returns a string.
    title_text = title_tag.get_text().strip()

    # A blank title would otherwise produce a file called just ".pdf".
    return title_text or 'untitled'

def _safe_filename(name, fallback='untitled'):
    """
    Return ``name`` with characters that are unsafe in file names replaced.

    :param name: Arbitrary text, e.g. a page title.
    :param fallback: Value returned when nothing usable is left.
    :return: ``name`` with path separators, reserved punctuation and control
        characters replaced by ``_`` and surrounding whitespace removed.
    """
    import re

    # '/' and '\\' would change the target directory; the other characters
    # are rejected in file names on Windows.
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name).strip()
    return cleaned or fallback


def save_webpage_as_pdf(url, username, password, timeout=30):
    """
    Save a webpage as a PDF named after the page's ``<title>``.

    The page is downloaded with ``requests`` using HTTP Basic authentication,
    its title is read with ``get_title`` and made safe for use as a file
    name, and the WeasyPrint rendering is written to ``<title>.pdf`` in the
    current directory. When the response status is not 200, an error message
    is printed and no file is written.

    :param url: URL of the webpage to save; also passed to WeasyPrint as
        ``base_url``.
    :param username: Username for authentication.
    :param password: Password for authentication.
    :param timeout: Timeout in seconds handed to ``requests`` so that a
        stalled server cannot block the call indefinitely.
    """
    # The context manager closes the session even when the request raises;
    # the original code never closed it.
    with requests.Session() as session:
        session.auth = HTTPBasicAuth(username, password)
        response = session.get(url, timeout=timeout)

    # Guard clause: report the failure and stop before rendering anything.
    if response.status_code != 200:
        print('Error while fetching webpage content.')
        return

    content = response.content

    # Titles are free text from the remote page; without sanitising, a title
    # such as "News / Sport: today?" makes open() fail or write elsewhere.
    title = _safe_filename(get_title(content))
    pdf_output_path = f'{title}.pdf'

    # Render the downloaded bytes to PDF with WeasyPrint.
    pdf = HTML(string=content, base_url=url).write_pdf()

    # Save the PDF to the output path.
    with open(pdf_output_path, 'wb') as f:
        f.write(pdf)

    print(f'Webpage saved as {pdf_output_path}')

# Example usage: these statements run at module level, so the download and
# conversion start as soon as the snippet is executed.
webpage_url = 'https://www.example.com'
# Placeholder credentials; replace them with real values before running.
username = 'your_username'
password = 'your_password'

save_webpage_as_pdf(webpage_url, username, password)

Tagi:

Dodaj komentarz

Anuluj pisanie odpowiedzi

Search

O Blogu

Cześć Podróżniku!

Ta strona nie ma być typowym poradnikiem IT. Jej głównym celem jest zapisywanie krótkich notatek, które mogą się przydać w codziennym życiu podczas korzystania z różnych urządzeń lub ich konfiguracji, np. ustawienia DHCP na routerze Cisco, ustawienia karty sieciowej na Linuksie itp.

Buy me a coffee

[Empty]

Welcome to my blog