chore: Add custom headers to LocalSeleniumCrawlerStrategy

This commit is contained in:
unclecode 2024-06-17 15:50:03 +08:00
parent 9a97aacd85
commit 77da48050d
2 changed files with 24 additions and 8 deletions

View File

@@ -139,6 +139,13 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.driver = webdriver.Chrome(service=self.service, options=self.options)
self.driver = self.execute_hook('on_driver_created', self.driver)
def set_custom_headers(self, headers: dict):
    """Register extra HTTP headers on the browser session via CDP.

    Issues two Chrome DevTools Protocol commands through ``self.driver``:
    ``Network.enable`` followed by ``Network.setExtraHTTPHeaders``.

    Args:
        headers: Mapping of header name to header value. ``None`` is
            treated as an empty mapping. Names and values are converted
            with ``str()`` so that numeric or other non-string values do
            not produce a payload the protocol rejects.
    """
    # Normalise up front: the CDP payload is a JSON object of string pairs.
    normalized = {
        str(name): str(value) for name, value in (headers or {}).items()
    }
    # Enable Network domain for sending headers
    self.driver.execute_cdp_cmd('Network.enable', {})
    # Set extra HTTP headers
    self.driver.execute_cdp_cmd(
        'Network.setExtraHTTPHeaders', {'headers': normalized}
    )
def crawl(self, url: str) -> str:
# Create md5 hash of the URL
import hashlib

View File

@@ -198,12 +198,11 @@ def using_crawler_hooks(crawler):
print("[HOOK] on_driver_created")
# Example customization: maximize the window
driver.maximize_window()
return driver
def before_get_url(driver):
print("[HOOK] before_get_url")
# Example customization: logging in to a hypothetical website
driver.get('https://example.com/login')
from selenium.webdriver.support.ui import WebDriverWait
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.NAME, 'username'))
)
@@ -215,8 +214,18 @@ def using_crawler_hooks(crawler):
)
# Add a custom cookie
driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
return driver
return driver
def before_get_url(driver):
    """Example hook: add a custom request header through CDP.

    Prints a hook tag, sends the two CDP commands below to ``driver`` and
    hands the same ``driver`` object back to the caller.
    """
    print("[HOOK] before_get_url")
    extra_headers = {'X-Test-Header': 'test'}
    # Order matters: the Network domain is enabled first, then the
    # custom header is registered.
    cdp_commands = (
        ('Network.enable', {}),
        ('Network.setExtraHTTPHeaders', {'headers': extra_headers}),
    )
    for command, params in cdp_commands:
        driver.execute_cdp_cmd(command, params)
    return driver
def after_get_url(driver):
print("[HOOK] after_get_url")
# Example customization: log the URL
@@ -225,9 +234,9 @@ def using_crawler_hooks(crawler):
# Example hook that receives the driver and the page HTML.
# NOTE(review): this listing is a diff view whose +/- markers and indentation
# were lost, so two versions of the body appear back to back. Read as plain
# code, everything after the first `return` is unreachable. Confirm against
# the repository which version is current.
def before_return_html(driver, html):
print("[HOOK] before_return_html")
# Example customization: modify the HTML (for demonstration purposes)
# Replaces the text 'Example Domain' with 'Test Domain' and returns the
# driver together with the modified HTML.
modified_html = html.replace('Example Domain', 'Test Domain')
return driver, modified_html
# Example customization: log the HTML
# Presumably the replacement body from the commit: prints the length of the
# HTML and returns only the driver -- TODO confirm.
print(len(html))
return driver
cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)