# crawl4ai/docs/examples/hooks_example.py

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from playwright.async_api import Page, BrowserContext

async def main() -> None:
    """Run a single crawl of https://example.com with every hook registered.

    The function builds a headless ``BrowserConfig`` and a cache-bypassing
    ``CrawlerRunConfig``, attaches eight example hook coroutines to the
    crawler strategy via ``set_hook``, starts the crawler, performs one
    ``arun`` call, prints the crawled URL and HTML length, and finally closes
    the crawler. The crawler is closed even when the crawl itself raises.
    """
    print("🔗 Hooks Example: Demonstrating different hook use cases")

    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True
    )

    # Configure crawler settings
    crawler_run_config = CrawlerRunConfig(
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="body",
        cache_mode=CacheMode.BYPASS
    )

    # Create crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    # Define and set hook functions
    async def on_browser_created(browser, context: BrowserContext, **kwargs):
        """Hook called after the browser is created"""
        print("[HOOK] on_browser_created - Browser is ready!")
        # This example applies no browser-level customization; the browser is
        # handed back unchanged.
        return browser

    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
        """Hook called after a new page and context are created"""
        print("[HOOK] on_page_context_created - New page created!")
        # Example: Add a session cookie to the browser context
        await context.add_cookies([{
            'name': 'session_id',
            'value': 'example_session',
            'domain': '.example.com',
            'path': '/'
        }])
        # Example: Set default viewport size
        await page.set_viewport_size({"width": 1080, "height": 800})
        return page

    async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
        """Hook called when the user agent is updated"""
        print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
        return page

    async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
        """Hook called after custom JavaScript execution"""
        print("[HOOK] on_execution_started - Custom JS executed!")
        return page

    async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
        """Hook called before navigating to each URL"""
        print(f"[HOOK] before_goto - About to visit: {url}")
        # Example: Add custom headers for the request
        await page.set_extra_http_headers({
            "Custom-Header": "my-value"
        })
        return page

    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
        """Hook called after navigating to each URL"""
        print(f"[HOOK] after_goto - Successfully loaded: {url}")
        # Example: Wait for a specific element to be loaded
        try:
            await page.wait_for_selector('.content', timeout=1000)
            print("Content element found!")
        except Exception:
            # Best-effort wait: a missing element must not abort the crawl.
            # ``Exception`` (rather than a bare ``except``) lets task
            # cancellation and KeyboardInterrupt propagate.
            print("Content element not found, continuing anyway")
        return page

    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
        """Hook called before retrieving the HTML content"""
        print("[HOOK] before_retrieve_html - About to get HTML content")
        # Example: Scroll to bottom to trigger lazy loading
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        return page

    async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs):
        """Hook called before returning the HTML content"""
        print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
        # Example: You could modify the HTML content here if needed
        return page

    # Set all the hooks
    crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
    crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
    crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
    crawler.crawler_strategy.set_hook("before_goto", before_goto)
    crawler.crawler_strategy.set_hook("after_goto", after_goto)
    crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)
    crawler.crawler_strategy.set_hook("before_return_html", before_return_html)

    await crawler.start()

    # From here on the crawler is running, so make sure it is closed even if
    # the crawl or the result handling raises.
    try:
        # Example usage: crawl a simple website
        url = 'https://example.com'
        result = await crawler.arun(url, config=crawler_run_config)
        print(f"\nCrawled URL: {result.url}")
        print(f"HTML length: {len(result.html)}")
    finally:
        await crawler.close()

# Script entry point: only runs the example when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    import asyncio

    # Build the coroutine first, then let asyncio drive it to completion.
    example_coroutine = main()
    asyncio.run(example_coroutine)
Renames browser_config param to config in AsyncWebCrawler. Standardizes parameter naming convention across the codebase by renaming browser_config to the more concise config in the AsyncWebCrawler constructor. Updates all documentation examples and internal usages to reflect the new parameter name for consistency. Also improves hook execution by adding url/response parameters to goto hooks and fixes parameter ordering in the before_return_html hook. 2024-12-26 16:34:36 +08:00			`from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode`
			`from playwright.async_api import Page, BrowserContext`

			`async def main():`
			`print("🔗 Hooks Example: Demonstrating different hook use cases")`

			`# Configure browser settings`
			`browser_config = BrowserConfig(`
			`headless=True`
			`)`

			`# Configure crawler settings`
			`crawler_run_config = CrawlerRunConfig(`
			`js_code="window.scrollTo(0, document.body.scrollHeight);",`
			`wait_for="body",`
			`cache_mode=CacheMode.BYPASS`
			`)`

			`# Create crawler instance`
			`crawler = AsyncWebCrawler(config=browser_config)`

			`# Define and set hook functions`
			`async def on_browser_created(browser, context: BrowserContext, **kwargs):`
			`"""Hook called after the browser is created"""`
			`print("[HOOK] on_browser_created - Browser is ready!")`
			`# Example: Set a cookie that will be used for all requests`
			`return browser`

			`async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):`
			`"""Hook called after a new page and context are created"""`
			`print("[HOOK] on_page_context_created - New page created!")`
			`# Example: Set default viewport size`
			`await context.add_cookies([{`
			`'name': 'session_id',`
			`'value': 'example_session',`
			`'domain': '.example.com',`
			`'path': '/'`
			`}])`
refactor(docs): reorganize documentation structure and update styles. Reorganize documentation into core/advanced/extraction sections for better navigation. Update terminal theme styles and add rich library for better CLI output. Remove redundant tutorial files and consolidate content into core sections. Add personal story to index page for project context. BREAKING CHANGE: Documentation structure has been significantly reorganized. 2025-01-07 20:49:50 +08:00			`await page.set_viewport_size({"width": 1080, "height": 800})`
Renames browser_config param to config in AsyncWebCrawler. Standardizes parameter naming convention across the codebase by renaming browser_config to the more concise config in the AsyncWebCrawler constructor. Updates all documentation examples and internal usages to reflect the new parameter name for consistency. Also improves hook execution by adding url/response parameters to goto hooks and fixes parameter ordering in the before_return_html hook. 2024-12-26 16:34:36 +08:00			`return page`

			`async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):`
			`"""Hook called when the user agent is updated"""`
			`print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")`
			`return page`

			`async def on_execution_started(page: Page, context: BrowserContext, **kwargs):`
			`"""Hook called after custom JavaScript execution"""`
			`print("[HOOK] on_execution_started - Custom JS executed!")`
			`return page`

			`async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):`
			`"""Hook called before navigating to each URL"""`
			`print(f"[HOOK] before_goto - About to visit: {url}")`
			`# Example: Add custom headers for the request`
			`await page.set_extra_http_headers({`
			`"Custom-Header": "my-value"`
			`})`
			`return page`

			`async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):`
			`"""Hook called after navigating to each URL"""`
			`print(f"[HOOK] after_goto - Successfully loaded: {url}")`
			`# Example: Wait for a specific element to be loaded`
			`try:`
			`await page.wait_for_selector('.content', timeout=1000)`
			`print("Content element found!")`
			`except:`
			`print("Content element not found, continuing anyway")`
			`return page`

			`async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):`
			`"""Hook called before retrieving the HTML content"""`
			`print("[HOOK] before_retrieve_html - About to get HTML content")`
			`# Example: Scroll to bottom to trigger lazy loading`
			`await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")`
			`return page`

			`async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):`
			`"""Hook called before returning the HTML content"""`
			`print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")`
			`# Example: You could modify the HTML content here if needed`
			`return page`

			`# Set all the hooks`
			`crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)`
			`crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)`
			`crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)`
			`crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)`
			`crawler.crawler_strategy.set_hook("before_goto", before_goto)`
			`crawler.crawler_strategy.set_hook("after_goto", after_goto)`
			`crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)`
			`crawler.crawler_strategy.set_hook("before_return_html", before_return_html)`

			`await crawler.start()`

			`# Example usage: crawl a simple website`
			`url = 'https://example.com'`
			`result = await crawler.arun(url, config=crawler_run_config)`
			`print(f"\nCrawled URL: {result.url}")`
			`print(f"HTML length: {len(result.html)}")`

			`await crawler.close()`

			`if __name__ == "__main__":`
			`import asyncio`
			`asyncio.run(main())`