How to create a searchable PDF using Python and Selenium?

I want to create a program like FireShot (premium version) to take a webpage on chromedriver and convert it into a pdf.

Currently this is the code I came up with:

import time
import os
import glob
import base64
from PyPDF2 import PdfMerger
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# Pre-select "Save as PDF" as the destination in Chrome's print settings.
prefs = {"printing.print_preview_sticky_settings.appState": '{"recentDestinations":[{"id":"Save as PDF"}]}'}
options.add_experimental_option("prefs", prefs)
options.add_argument("--kiosk-printing")  # Auto confirm print
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Load the webpage
# driver.get() needs an absolute URL (scheme + host); a bare "/?authMode=login"
# path cannot be navigated to.
driver.get("https://www.coursera.org/?authMode=login")

# complete the sign-in and you are redirected to another page

# NOTE(review): assumes a second window/tab already exists at this point;
# window_handles[1] raises IndexError otherwise -- confirm the sign-in flow.
driver.switch_to.window(driver.window_handles[1])

def save_pdf(driver, file_name):
    """Print the driver's current page to PDF via CDP and write it to disk.

    Args:
        driver: Object exposing ``execute_cdp_cmd`` (a Chrome WebDriver).
        file_name: Path of the PDF file to create or overwrite.
    """
    # 8.27 x 11.69 corresponds to portrait A4 when the paper size is in inches.
    print_options = {'landscape': False, 'paperWidth': 8.27, 'paperHeight': 11.69}
    response = driver.execute_cdp_cmd("Page.printToPDF", print_options)
    with open(file_name, 'wb') as out:
        # The command returns the document base64-encoded under the 'data' key.
        pdf_bytes = base64.b64decode(response['data'])
        out.write(pdf_bytes)

def scroll_and_save(driver, scrollable_xpath, output_prefix, scroll_pause=2):
    """Save one PDF per scroll step of a scrollable element.

    The element is scrolled down by its own ``clientHeight`` after each
    capture until ``scrollTop`` stops changing.

    Args:
        driver: Selenium WebDriver positioned on the page to capture.
        scrollable_xpath: XPath locating the scrollable element.
        output_prefix: Prefix for the generated files, which are named
            ``<output_prefix>_page_<n>.pdf``.
        scroll_pause: Seconds to wait after each scroll so new content can
            load. Defaults to 2.

    Returns:
        List of the PDF file names written, in capture order.

    NOTE(review): Page.printToPDF prints the document rather than the visible
    viewport, so captures may not differ between scroll steps -- confirm
    against the target page.
    """
    scrollable_div = driver.find_element(By.XPATH, scrollable_xpath)
    file_list = []
    page_num = 1
    # Seed with the element's real position instead of a -1 sentinel, so an
    # element that cannot scroll produces one capture rather than a duplicate.
    last_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)

    while True:
        file_name = f"{output_prefix}_page_{page_num}.pdf"
        save_pdf(driver, file_name)
        file_list.append(file_name)

        driver.execute_script("arguments[0].scrollTop += arguments[0].clientHeight;", scrollable_div)
        time.sleep(scroll_pause)  # Allow time for new content to load

        new_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)
        if new_scroll_position == last_scroll_position:
            break  # Stop when scrolling reaches the end
        last_scroll_position = new_scroll_position
        page_num += 1

    return file_list

def merge_pdfs(file_list, output_file):
    """Concatenate PDFs into one file, then delete the individual inputs.

    Args:
        file_list: Paths of the PDFs to merge, in the desired page order.
        output_file: Path of the merged PDF to write.

    The inputs are removed only after the merged file has been written; if
    merging fails, the exception propagates and the inputs are left in place.
    """
    merger = PdfMerger()
    try:
        for pdf in file_list:
            merger.append(pdf)
        merger.write(output_file)
    finally:
        # Always release the merger's resources, even when a part fails to
        # load or the output cannot be written.
        merger.close()

    # Clean up individual PDF files
    for pdf in file_list:
        os.remove(pdf)

# Absolute XPath of the scrollable container, and the base name for all output.
scrollable_xpath = "/html/body/div[5]/div/div/div/div[2]/div[2]/div"
output_prefix = "practical_quiz_1"

# Capture one PDF per scroll step, then combine them into "<output_prefix>.pdf".
file_list = scroll_and_save(driver, scrollable_xpath, output_prefix)
merge_pdfs(file_list, f"{output_prefix}.pdf")

Currently the problem I am facing is that data = driver.execute_cdp_cmd("Page.printToPDF", params) only saves the top part of the webpage as a PDF, even though I scroll down. How do I resolve this while keeping the text in the PDF searchable?

I want to create a program like FireShot (premium version) to take a webpage on chromedriver and convert it into a pdf.

Currently this is the code I came up with:

import time
import os
import glob
import base64
from PyPDF2 import PdfMerger
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# Pre-select "Save as PDF" as the destination in Chrome's print settings.
prefs = {"printing.print_preview_sticky_settings.appState": '{"recentDestinations":[{"id":"Save as PDF"}]}'}
options.add_experimental_option("prefs", prefs)
options.add_argument("--kiosk-printing")  # Auto confirm print
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Load the webpage
# The host was missing its top-level domain ("coursera." is not resolvable).
driver.get("https://www.coursera.org/?authMode=login")

# complete the sign-in and you are redirected to another page

# NOTE(review): assumes a second window/tab already exists at this point;
# window_handles[1] raises IndexError otherwise -- confirm the sign-in flow.
driver.switch_to.window(driver.window_handles[1])

def save_pdf(driver, file_name):
    """Print the driver's current page to PDF via CDP and write it to disk.

    Args:
        driver: Object exposing ``execute_cdp_cmd`` (a Chrome WebDriver).
        file_name: Path of the PDF file to create or overwrite.
    """
    # 8.27 x 11.69 corresponds to portrait A4 when the paper size is in inches.
    print_options = {'landscape': False, 'paperWidth': 8.27, 'paperHeight': 11.69}
    response = driver.execute_cdp_cmd("Page.printToPDF", print_options)
    with open(file_name, 'wb') as out:
        # The command returns the document base64-encoded under the 'data' key.
        pdf_bytes = base64.b64decode(response['data'])
        out.write(pdf_bytes)

def scroll_and_save(driver, scrollable_xpath, output_prefix, scroll_pause=2):
    """Save one PDF per scroll step of a scrollable element.

    The element is scrolled down by its own ``clientHeight`` after each
    capture until ``scrollTop`` stops changing.

    Args:
        driver: Selenium WebDriver positioned on the page to capture.
        scrollable_xpath: XPath locating the scrollable element.
        output_prefix: Prefix for the generated files, which are named
            ``<output_prefix>_page_<n>.pdf``.
        scroll_pause: Seconds to wait after each scroll so new content can
            load. Defaults to 2.

    Returns:
        List of the PDF file names written, in capture order.

    NOTE(review): Page.printToPDF prints the document rather than the visible
    viewport, so captures may not differ between scroll steps -- confirm
    against the target page.
    """
    scrollable_div = driver.find_element(By.XPATH, scrollable_xpath)
    file_list = []
    page_num = 1
    # Seed with the element's real position instead of a -1 sentinel, so an
    # element that cannot scroll produces one capture rather than a duplicate.
    last_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)

    while True:
        file_name = f"{output_prefix}_page_{page_num}.pdf"
        save_pdf(driver, file_name)
        file_list.append(file_name)

        driver.execute_script("arguments[0].scrollTop += arguments[0].clientHeight;", scrollable_div)
        time.sleep(scroll_pause)  # Allow time for new content to load

        new_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)
        if new_scroll_position == last_scroll_position:
            break  # Stop when scrolling reaches the end
        last_scroll_position = new_scroll_position
        page_num += 1

    return file_list

def merge_pdfs(file_list, output_file):
    """Concatenate PDFs into one file, then delete the individual inputs.

    Args:
        file_list: Paths of the PDFs to merge, in the desired page order.
        output_file: Path of the merged PDF to write.

    The inputs are removed only after the merged file has been written; if
    merging fails, the exception propagates and the inputs are left in place.
    """
    merger = PdfMerger()
    try:
        for pdf in file_list:
            merger.append(pdf)
        merger.write(output_file)
    finally:
        # Always release the merger's resources, even when a part fails to
        # load or the output cannot be written.
        merger.close()

    # Clean up individual PDF files
    for pdf in file_list:
        os.remove(pdf)

# Absolute XPath of the scrollable container, and the base name for all output.
scrollable_xpath = "/html/body/div[5]/div/div/div/div[2]/div[2]/div"
output_prefix = "practical_quiz_1"

# Capture one PDF per scroll step, then combine them into "<output_prefix>.pdf".
file_list = scroll_and_save(driver, scrollable_xpath, output_prefix)
merge_pdfs(file_list, f"{output_prefix}.pdf")

Share Improve this question edited Mar 4 at 6:43 mwopitz 6584 silver badges19 bronze badges asked Mar 3 at 12:02 salt lake 332 silver badges9 bronze badges

Could you please reduce your code to a minimal snippet that shows the issue? – bfontaine Commented Mar 3 at 17:50
Maybe first use print() (and print(type(...)), print(len(...)), etc.) to see which part of code is executed and what you really have in variables. It is called "print debugging" and it helps to see what code is really doing. – furas Commented Mar 3 at 21:22

Add a comment |

1 Answer 1

Sorted by: Reset to default 1

I was unable to recreate this same exact issue for another website; for other websites when you screenshot using driver.execute_cdp_cmd("Page.printToPDF", params) the screenshot stores the entire webpage with no need to scroll - so not sure why it didn't work for Coursera.

So to resolve, I changed the params being passed into this call and the zoom:

# Render the page body at 90% zoom before printing.
driver.execute_script("document.body.style.zoom='90%'")
# Use a tall custom paper size (12 x 25) in place of A4.
params = {
    'landscape': False,
    'paperWidth': 12,
    'paperHeight': 25,
}
data = driver.execute_cdp_cmd("Page.printToPDF", params)

To resize the PDF for rendering it on GitHub, I added:

import fitz  # PyMuPDF

# Re-lay each page of the captured PDF onto an A4 page, trimming 30 units
# from the left and right edges of the source page.
src = fitz.open("course_1/week_1/practical_quiz_1.pdf")  # Open source PDF
doc = fitz.open()  # Create a new PDF document

try:
    for ipage in src:
        rect = ipage.rect  # Get original page dimensions
        crop_rect = fitz.Rect(rect.x0 + 30, rect.y0, rect.x1 - 30, rect.y1)  # Adjust left/right padding

        # Keep the orientation of the source page.
        if rect.width > rect.height:
            fmt = fitz.paper_size("a4-l")  # Landscape A4
        else:
            fmt = fitz.paper_size("a4")  # Portrait A4

        page = doc.new_page(width=fmt[0], height=fmt[1])  # Create new page
        page.show_pdf_page(page.rect, src, ipage.number, clip=crop_rect)  # Apply cropping

    doc.save("course_1/week_1/practical_quiz_1_a4.pdf")
finally:
    # Close both documents even if a page fails to render or the save fails;
    # the original never closed the source document.
    doc.close()
    src.close()

This seemed to do the trick.

Code: https://github.com/psymbio/math_ml/blob/main/coursera_pdf_maker.ipynb

PDF Custom: https://github.com/psymbio/math_ml/blob/main/course_1/week_1/practical_quiz_1.pdf

PDF A4: https://github.com/psymbio/math_ml/blob/main/course_1/week_1/practical_quiz_1_a4.pdf

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

How to create a searchable PDF using Python and Selenium? - Stack Overflow

1 Answer 1

与本文相关的文章

评论列表(0)