mirror of
				https://github.com/infiniflow/ragflow.git
				synced 2025-11-04 03:39:41 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			81 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			81 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import re
 | 
						|
import json
 | 
						|
import base64
 | 
						|
 | 
						|
from selenium import webdriver
 | 
						|
from selenium.webdriver.chrome.options import Options
 | 
						|
from selenium.webdriver.chrome.service import Service
 | 
						|
from selenium.common.exceptions import TimeoutException
 | 
						|
from selenium.webdriver.support.ui import WebDriverWait
 | 
						|
from selenium.webdriver.support.expected_conditions import staleness_of
 | 
						|
from webdriver_manager.chrome import ChromeDriverManager
 | 
						|
from selenium.webdriver.common.by import By
 | 
						|
 | 
						|
 | 
						|
def html2pdf(
        source: str,
        timeout: int = 2,
        install_driver: bool = True,
        print_options: "dict | None" = None,
):
    """Render ``source`` to PDF by delegating to ``__get_pdf_from_html``.

    Args:
        source: Address passed on to the browser (forwarded as ``path``).
        timeout: Number of seconds forwarded to the page wait.
        install_driver: Forwarded flag selecting whether a ChromeDriver is
            installed through ``ChromeDriverManager`` before launching Chrome.
        print_options: Extra print settings layered over the built-in
            defaults. ``None`` (the default) means "no overrides".

    Returns:
        Whatever ``__get_pdf_from_html`` returns (the decoded PDF bytes on
        success).
    """
    # A fresh dict per call avoids sharing one mutable default object
    # between invocations.
    options = {} if print_options is None else print_options
    return __get_pdf_from_html(source, timeout, install_driver, options)
 | 
						|
 | 
						|
 | 
						|
def __send_devtools(driver, cmd, params=None):
    """Send a DevTools command through the driver's HTTP endpoint.

    Args:
        driver: WebDriver instance; its ``session_id`` and
            ``command_executor`` are used to build and send the request.
        cmd: DevTools command name, e.g. ``"Page.printToPDF"``.
        params: Parameters for the command. ``None`` is sent as an empty
            object.

    Returns:
        The ``"value"`` entry of the response.

    Raises:
        RuntimeError: If the command executor returns an empty or missing
            response.
    """
    # NOTE(review): this relies on the private ``_url`` / ``_request``
    # attributes of the command executor — confirm they exist in the
    # Selenium version this project pins.
    resource = f"/session/{driver.session_id}/chromium/send_command_and_get_result"
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": {} if params is None else params})
    response = driver.command_executor._request("POST", url, body)

    if not response:
        # The response may be None here, so it must not be dereferenced
        # while building the error.
        raise RuntimeError(
            f"DevTools command {cmd!r} returned no response: {response!r}"
        )

    return response.get("value")
 | 
						|
 | 
						|
 | 
						|
def __get_pdf_from_html(
        path: str,
        timeout: int,
        install_driver: bool,
        print_options: dict
):
    """Load ``path`` in headless Chrome and return the page printed as PDF.

    Args:
        path: Address given to ``driver.get``.
        timeout: Seconds to wait for the page's ``<html>`` element to go
            stale before printing.
        install_driver: When true, a ChromeDriver is installed through
            ``ChromeDriverManager`` and used for the session; otherwise
            Chrome is started with Selenium's default driver lookup.
        print_options: Overrides merged over the default
            ``Page.printToPDF`` parameters. ``None`` or empty means no
            overrides.

    Returns:
        The PDF content as bytes (base64-decoded ``data`` field of the
        DevTools result).
    """
    webdriver_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        webdriver_options.add_argument(flag)
    # NOTE(review): value 2 presumably blocks image loading in Chrome's
    # content settings — confirm this is intended for PDF output.
    webdriver_options.experimental_options["prefs"] = {
        "profile.default_content_settings": {"images": 2},
    }

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    # From here on the browser process exists; always shut it down, even
    # when navigation or printing fails.
    try:
        driver.get(path)

        try:
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            # Expected path: the document stayed in place for the whole
            # wait, so the wait simply acted as a settle delay.
            pass

        # Print regardless of how the wait ended. Previously a page whose
        # <html> element went stale caused a silent ``None`` return and a
        # leaked browser.
        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options or {})
        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options)
    finally:
        driver.quit()

    return base64.b64decode(result["data"])
 | 
						|
 | 
						|
 | 
						|
def is_valid_url(url: str) -> bool:
    """Report whether ``url`` starts with an http(s), ftp or file URL.

    The pattern is anchored only at the beginning of the string, so text
    following a matching prefix does not cause a rejection.
    """
    pattern = r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    return re.match(pattern, url) is not None
 |