
import asyncio
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import quote

from pydantic import BaseModel

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    CrawlerHub,
    CrawlResult,
    DefaultMarkdownGenerator,
    PruningContentFilter,
)

__current_dir = Path(__file__).parent

# Crawl4ai Hello Web
async def little_hello_web():
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url="https://www.helloworld.org")
        print(result.markdown.raw_markdown[:500])

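
# Per the v0.5 deprecation of markdown_v2, `result.markdown` is a result object
# that stays backward compatible: it behaves as a string when needed while also
# exposing raw_markdown (and fit_markdown) as attributes. A minimal sketch of
# both access styles; the function name below is this example's own:
async def markdown_compat_demo():
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url="https://www.helloworld.org")
        print(str(result.markdown)[:100])          # string-like access
        print(result.markdown.raw_markdown[:100])  # explicit attribute access
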

async def hello_web():
    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
                )
            ),
        )
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=crawler_config
        )
        # fit_markdown is only populated when a content filter is configured
        # on the markdown generator, as done above.
        print(result.markdown.fit_markdown[:500])


# Naive Approach Using Large Language Models
async def extract_using_llm():
    print("Extracting using Large Language Models")

    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config)

    await crawler.start()
    try:
        class Sitelink(BaseModel):
            title: str
            link: str

        class GoogleSearchResult(BaseModel):
            title: str
            link: str
            snippet: str
            sitelinks: Optional[List[Sitelink]] = None

        llm_extraction_strategy = LLMExtractionStrategy(
            provider="openai/gpt-4o",
            schema=GoogleSearchResult.model_json_schema(),
            instruction="""I want to extract the title, link, snippet, and sitelinks from a Google search result. I shared here the content of div#search from the search result page. We are just interested in organic search results.
            Example:
            {
                "title": "Google",
                "link": "https://www.google.com",
                "snippet": "Google is a search engine.",
                "sitelinks": [
                    {
                        "title": "Gmail",
                        "link": "https://mail.google.com"
                    },
                    {
                        "title": "Google Drive",
                        "link": "https://drive.google.com"
                    }
                ]
            }""",
            # apply_chunking=False,
            chunk_token_threshold=2**12,  # 2^12 = 4096 tokens per chunk
            verbose=True,
            input_format="cleaned_html",  # options: "html", "markdown", "cleaned_html"
        )

        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            keep_attrs=["id", "class"],
            keep_data_attributes=True,
            delay_before_return_html=2,
            extraction_strategy=llm_extraction_strategy,
            css_selector="div#search",
        )

        result: CrawlResult = await crawler.arun(
            url="https://www.google.com/search?q=apple%20inc&start=0&num=10",
            config=crawl_config,
        )

        search_result = {}
        if result.success:
            search_result = json.loads(result.extracted_content)

        # Save search result to file
        with open(__current_dir / "search_result_using_llm.json", "w") as f:
            f.write(json.dumps(search_result, indent=4))
        print(json.dumps(search_result, indent=4))

    finally:
        await crawler.close()

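
# After an LLM-backed run you can inspect token consumption to gauge cost.
# A minimal sketch, assuming the show_usage() helper that crawl4ai documents
# for LLMExtractionStrategy:
#
#     llm_extraction_strategy.show_usage()  # prints prompt/completion token totals
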

# Example of generating an extraction schema from sample HTML
async def schema_generator():
    print("Generating schema")
    html = ""

    # Load html from file
    with open(__current_dir / "google_search_item.html", "r") as f:
        html = f.read()

    organic_schema = JsonCssExtractionStrategy.generate_schema(
        html=html,
        target_json_example="""{
            "title": "...",
            "link": "...",
            "snippet": "...",
            "date": "1 hour ago",
            "sitelinks": [
                {
                    "title": "...",
                    "link": "..."
                }
            ]
        }""",
        query="""The given HTML is the crawled HTML from the Google search result, which refers to one HTML element representing one organic Google search result. Please find the schema for the organic search item based on the given HTML. I am interested in the title, link, snippet text, sitelinks, and date.""",
    )

    print(json.dumps(organic_schema, indent=4))

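
# A generated schema is plain JSON-serializable data, so it can be cached on
# disk and fed straight into JsonCssExtractionStrategy on later runs with no
# further LLM calls. A minimal sketch, reusing the organic_schema.json file
# that build_schema() below writes; the function name is this example's own:
async def extract_with_generated_schema():
    with open(__current_dir / "organic_schema.json", "r") as f:
        schema = json.load(f)
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        css_selector="div#search",
        extraction_strategy=JsonCssExtractionStrategy(schema=schema),
    )
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://www.google.com/search?q=apple%20inc&start=0&num=10",
            config=config,
        )
        if result.success:
            print(json.loads(result.extracted_content))
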

# Golden Standard
async def build_schema(html: str, force: bool = False) -> Dict[str, Any]:
    print("Building schema")
    schemas = {}
    if (__current_dir / "organic_schema.json").exists() and not force:
        with open(__current_dir / "organic_schema.json", "r") as f:
            schemas["organic"] = json.loads(f.read())
    else:
        # Extract schema from html
        organic_schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            target_json_example="""{
                "title": "...",
                "link": "...",
                "snippet": "...",
                "date": "1 hour ago",
                "sitelinks": [
                    {
                        "title": "...",
                        "link": "..."
                    }
                ]
            }""",
            query="""The given HTML is the crawled HTML from the Google search result. Please find the schema for the organic search item in the given HTML. I am interested in the title, link, snippet text, sitelinks, and date. Usually they are all inside a div#search.""",
        )

        # Save schema to file current_dir/organic_schema.json
        with open(__current_dir / "organic_schema.json", "w") as f:
            f.write(json.dumps(organic_schema, indent=4))

        schemas["organic"] = organic_schema

    # Repeat the same for top_stories_schema
    if (__current_dir / "top_stories_schema.json").exists() and not force:
        with open(__current_dir / "top_stories_schema.json", "r") as f:
            schemas["top_stories"] = json.loads(f.read())
    else:
        top_stories_schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            target_json_example="""{
                "title": "...",
                "link": "...",
                "source": "Insider Monkey",
                "date": "1 hour ago"
            }""",
            query="""The given HTML is the crawled HTML from the Google search result. Please find the schema for the Top Stories item in the given HTML. I am interested in the title, link, source, and date.""",
        )

        with open(__current_dir / "top_stories_schema.json", "w") as f:
            f.write(json.dumps(top_stories_schema, indent=4))

        schemas["top_stories"] = top_stories_schema

    # Repeat the same for suggested_queries_schema
    if (__current_dir / "suggested_queries_schema.json").exists() and not force:
        with open(__current_dir / "suggested_queries_schema.json", "r") as f:
            schemas["suggested_queries"] = json.loads(f.read())
    else:
        suggested_queries_schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            target_json_example="""{
                "query": "A for Apple"
            }""",
            query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "relatedSearches" at the bottom of the page. I am interested in the queries only.""",
        )

        with open(__current_dir / "suggested_queries_schema.json", "w") as f:
            f.write(json.dumps(suggested_queries_schema, indent=4))

        schemas["suggested_queries"] = suggested_queries_schema

    return schemas

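
# Usage note: pass force=True to regenerate and re-cache the schemas even when
# the JSON files already exist, e.g.
#
#     schemas = await build_schema(html, force=True)
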

async def search(q: str = "apple inc") -> Dict[str, Any]:
    print("Searching for:", q)

    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config)
    search_result: Dict[str, List[Dict[str, Any]]] = {}

    await crawler.start()
    try:
        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            keep_attrs=["id", "class"],
            keep_data_attributes=True,
            delay_before_return_html=2,
        )
        result: CrawlResult = await crawler.arun(
            f"https://www.google.com/search?q={quote(q)}&start=0&num=10",
            config=crawl_config,
        )

        if result.success:
            schemas: Dict[str, Any] = await build_schema(result.html)

            # Run each cached schema over the raw page HTML
            for schema in schemas.values():
                schema_key = schema["name"].lower().replace(" ", "_")
                search_result[schema_key] = JsonCssExtractionStrategy(
                    schema=schema
                ).run(
                    url="",
                    sections=[result.html],
                )

            # Save search result to file
            with open(__current_dir / "search_result.json", "w") as f:
                f.write(json.dumps(search_result, indent=4))
            print(json.dumps(search_result, indent=4))

    finally:
        await crawler.close()

    return search_result

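
# The pattern above is the "golden standard" this demo builds toward: pay for
# one LLM pass to generate schemas, cache them as JSON, then serve every later
# search with fast, deterministic CSS extraction and no further LLM calls.
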

# Example of using CrawlerHub
async def hub_example(query: str = "apple inc"):
    print("Using CrawlerHub")
    crawler_cls = CrawlerHub.get("google_search")
    crawler = crawler_cls()

    # Text search
    text_results = await crawler.run(
        query=query,
        search_type="text",
        schema_cache_path=str(Path.home() / ".crawl4ai"),
    )

    # Save search result to file
    with open(__current_dir / "search_result_using_hub.json", "w") as f:
        f.write(json.dumps(json.loads(text_results), indent=4))

    print(json.dumps(json.loads(text_results), indent=4))


async def demo():
    # Step 1: Introduction & Overview
    # await little_hello_web()
    # await hello_web()

    # Step 2: Demo end result, using hub
    # await hub_example()

    # Step 3: Using LLM for extraction
    # await extract_using_llm()

    # Step 4: Get familiar with schema generation
    # await schema_generator()

    # Step 5: Golden Standard
    # await search()

    # Step 6: Introduction to CrawlerHub
    await hub_example()


if __name__ == "__main__":
    asyncio.run(demo())