# test.py
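# Example script: look up crawler classes by name from crawl4ai's CrawlerHub
# and run them asynchronously.
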
from crawl4ai import CrawlerHub
import json


async def amazon_example():
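    # Fetch the "amazon_product" crawler class from the hub, if one is registered.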
    if crawler_cls := CrawlerHub.get("amazon_product"):
        crawler = crawler_cls()
        print(f"Crawler version: {crawler_cls.meta['version']}")
        print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
        print(await crawler.run("https://amazon.com/test"))
    else:
        print("Crawler not found!")


async def google_example():
    # Get crawler dynamically
    crawler_cls = CrawlerHub.get("google_search")
    crawler = crawler_cls()

    # Text search
    text_results = await crawler.run(
        query="apple inc",
        search_type="text",
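        # Machine-specific schema cache location; adjust for your environment.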
        schema_cache_path="/Users/unclecode/.crawl4ai",
    )
    # The result is a JSON string; parse and pretty-print it.
    print(json.dumps(json.loads(text_results), indent=4))

    # Image search
    # image_results = await crawler.run(query="apple inc", search_type="image")
    # print(image_results)


if __name__ == "__main__":
    import asyncio

    # asyncio.run(amazon_example())
    asyncio.run(google_example())