mirror of
				https://github.com/infiniflow/ragflow.git
				synced 2025-10-31 01:40:20 +00:00 
			
		
		
		
	
		
			
	
	
		
			81 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			81 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | import re | ||
|  | import json | ||
|  | import base64 | ||
|  | 
 | ||
|  | from selenium import webdriver | ||
|  | from selenium.webdriver.chrome.options import Options | ||
|  | from selenium.webdriver.chrome.service import Service | ||
|  | from selenium.common.exceptions import TimeoutException | ||
|  | from selenium.webdriver.support.ui import WebDriverWait | ||
|  | from selenium.webdriver.support.expected_conditions import staleness_of | ||
|  | from webdriver_manager.chrome import ChromeDriverManager | ||
|  | from selenium.webdriver.common.by import By | ||
|  | 
 | ||
|  | 
 | ||
def html2pdf(
        source: str,
        timeout: int = 2,
        install_driver: bool = True,
        print_options: "dict | None" = None,
):
    """Convert the page at ``source`` to a PDF and return its content.

    This is a thin public wrapper around ``__get_pdf_from_html``.

    Args:
        source: Location handed to the browser's ``get`` call.
        timeout: Seconds to wait for the page before printing.
        install_driver: When true, a ChromeDriver binary is obtained through
            ``ChromeDriverManager``; otherwise Selenium's default driver
            lookup is used.
        print_options: Extra options merged over the default print settings.
            ``None`` (the default) means "no overrides".

    Returns:
        Whatever ``__get_pdf_from_html`` returns for the rendered page.
    """
    # A fresh dict per call avoids sharing one mutable default between calls
    # and lets callers pass None explicitly.
    options = {} if print_options is None else print_options
    return __get_pdf_from_html(source, timeout, install_driver, options)
|  | 
 | ||
|  | 
 | ||
def __send_devtools(driver, cmd, params=None):
    """Send a DevTools command through the driver's command executor.

    The command is POSTed as JSON to the session's
    ``chromium/send_command_and_get_result`` endpoint.

    Args:
        driver: WebDriver instance exposing ``session_id`` and
            ``command_executor``.
        cmd: DevTools command name, e.g. ``"Page.printToPDF"``.
        params: Parameters for the command; ``None`` sends an empty object.

    Returns:
        The ``"value"`` entry of the executor's response.

    Raises:
        RuntimeError: If the executor returns an empty or ``None`` response.
    """
    # NOTE: relies on the private ``_url`` / ``_request`` members of the
    # command executor; there is no public equivalent used in this file.
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    payload = {"cmd": cmd, "params": {} if params is None else params}
    response = driver.command_executor._request("POST", url, json.dumps(payload))

    if not response:
        # The previous code called ``response.get`` here, which fails on None
        # and yields an empty message for {}; report something actionable.
        raise RuntimeError(
            "DevTools command %r returned an empty response: %r" % (cmd, response)
        )

    return response.get("value")
|  | 
 | ||
|  | 
 | ||
def __get_pdf_from_html(
        path: str,
        timeout: int,
        install_driver: bool,
        print_options: dict
):
    """Load ``path`` in headless Chrome and return the printed PDF as bytes.

    Args:
        path: Location passed to ``driver.get``.
        timeout: Seconds to wait on the page's ``<html>`` element before
            printing.
        install_driver: When true, build the Chrome service from
            ``ChromeDriverManager().install()``; otherwise let Selenium
            locate the driver itself.
        print_options: Overrides merged on top of the default
            ``Page.printToPDF`` options; a falsy value means no overrides.

    Returns:
        The base64-decoded ``"data"`` field of the ``Page.printToPDF`` result.

    The browser is always shut down before returning or propagating an
    exception, so a failed load or print does not leave a Chrome process
    running.
    """
    webdriver_options = Options()
    for argument in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        webdriver_options.add_argument(argument)
    # Presumably value 2 blocks image loading in Chrome's content settings --
    # confirm against the Chrome preferences reference.
    webdriver_options.experimental_options["prefs"] = {
        "profile.default_content_settings": {"images": 2},
    }

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    try:
        driver.get(path)

        try:
            # Wait up to ``timeout`` seconds for the current <html> element
            # to go stale.
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            # The element stayed attached for the whole wait; print the page
            # as it is now.
            pass

        # Printing happens whether or not the wait timed out; previously a
        # page that went stale early made the function return None.
        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options or {})
        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options)
        return base64.b64decode(result["data"])
    finally:
        # Release the browser on every exit path, including errors.
        driver.quit()
|  | 
 | ||
|  | 
 | ||
def is_valid_url(url: str) -> bool:
    """Tell whether ``url`` starts with an http(s), ftp or file URL.

    The pattern is anchored only at the beginning of the string, so any
    trailing text after a matching prefix does not affect the result. The
    scheme is matched case-sensitively.
    """
    url_pattern = r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    found = re.match(url_pattern, url)
    return found is not None