Create series of quickstart files.

commit eb131bebdf
parent 5c15837677
@@ -12,7 +12,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
 
 ## Try it Now!
 
-✨ Play around with this [Colab notebook](https://colab.research.google.com/drive/1sJPAmeLj5PMrg2VgOwMJ2ubGIcK0cJeX?usp=sharing)
+✨ Play around with this [Colab notebook](https://colab.research.google.com/drive/1REChY6fXQf-EaVYLv0eHEWvzlYxGm0pd?usp=sharing)
 
 ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
File diff suppressed because it is too large.

docs/examples/quickstart_async.py (new file, 242 lines)
@@ -0,0 +1,242 @@
import asyncio
import time
import json
import os
import re
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")

async def simple_crawl():
    print("\n--- Basic Usage ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(result.markdown[:500])  # Print first 500 characters

async def js_and_css():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
        js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            css_selector="article.tease-card",
            bypass_cache=True
        )
        print(result.extracted_content[:500])  # Print first 500 characters

async def use_proxy():
    print("\n--- Using a Proxy ---")
    print("Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example.")
    # Uncomment and modify the following lines to use a proxy
    # async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
    #     result = await crawler.arun(
    #         url="https://www.nbcnews.com/business",
    #         bypass_cache=True
    #     )
    #     print(result.markdown[:500])  # Print first 500 characters

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

async def extract_openai_fees():
    print("\n--- Extracting Structured Data with OpenAI ---")
    print("Note: Set your OpenAI API key as an environment variable to run this example.")
    if not os.getenv('OPENAI_API_KEY'):
        print("OpenAI API key not found. Skipping this example.")
        return

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url='https://openai.com/api/pricing/',
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
                schema=OpenAIModelFee.schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
                Do not miss any models in the entire content. One extracted model JSON format should look like this:
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)

async def crawl_typescript_commits():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    # Hook that waits until a new first commit appears after the pagination JS runs.
    async def on_execution_started(page):
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            # Reuse the same browser session; after the first page, only run the
            # pagination JavaScript instead of reloading the URL.
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,
                bypass_cache=True,
                js_only=page > 0
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

async def extract_news_teasers():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "News Teaser Extractor",
        "baseSelector": ".wide-tease-item__wrapper",
        "fields": [
            {
                "name": "category",
                "selector": ".unibrow span[data-testid='unibrow-text']",
                "type": "text",
            },
            {
                "name": "headline",
                "selector": ".wide-tease-item__headline",
                "type": "text",
            },
            {
                "name": "summary",
                "selector": ".wide-tease-item__description",
                "type": "text",
            },
            {
                "name": "time",
                "selector": "[data-testid='wide-tease-date']",
                "type": "text",
            },
            {
                "name": "image",
                "type": "nested",
                "selector": "picture.teasePicture img",
                "fields": [
                    {"name": "src", "type": "attribute", "attribute": "src"},
                    {"name": "alt", "type": "attribute", "attribute": "alt"},
                ],
            },
            {
                "name": "link",
                "selector": "a[href]",
                "type": "attribute",
                "attribute": "href",
            },
        ],
    }

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=extraction_strategy,
            bypass_cache=True,
        )

        assert result.success, "Failed to crawl the page"

        news_teasers = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(news_teasers)} news teasers")
        print(json.dumps(news_teasers[0], indent=2))

async def speed_comparison():
    print("\n--- Speed Comparison ---")
    print("Firecrawl (simulated):")
    print("Time taken: 7.02 seconds")
    print("Content length: 42074 characters")
    print("Images found: 49")
    print()

    async with AsyncWebCrawler() as crawler:
        # Crawl4AI simple crawl
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            word_count_threshold=0,
            bypass_cache=True,
            verbose=False
        )
        end = time.time()
        print("Crawl4AI (simple crawl):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()

        # Crawl4AI with JavaScript execution
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
            word_count_threshold=0,
            bypass_cache=True,
            verbose=False
        )
        end = time.time()
        print("Crawl4AI (with JavaScript execution):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")

    print("\nNote on Speed Comparison:")
    print("The speed test conducted here may not reflect optimal conditions.")
    print("When we call Firecrawl's API, we're seeing its best performance,")
    print("while Crawl4AI's performance is limited by the local network speed.")
    print("For a more accurate comparison, it's recommended to run these tests")
    print("on servers with a stable and fast internet connection.")
    print("Despite these limitations, Crawl4AI still demonstrates faster performance.")
    print("If you run these tests in an environment with better network conditions,")
    print("you may observe an even more significant speed advantage for Crawl4AI.")

async def main():
    await simple_crawl()
    await js_and_css()
    await use_proxy()
    await extract_openai_fees()
    await crawl_typescript_commits()
    await extract_news_teasers()
    await speed_comparison()

if __name__ == "__main__":
    asyncio.run(main())

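A note for trying the new examples: main() runs all seven demos in sequence, several of them need live network access, and the OpenAI demo also expects an OPENAI_API_KEY environment variable. A minimal sketch of a selective runner, assuming it is executed from docs/examples/ so that quickstart_async.py is on the import path:

import asyncio

import quickstart_async  # the new file above; importing it prints the banner lines

# Run only the basic crawl and the CSS-schema extraction demos.
asyncio.run(quickstart_async.simple_crawl())
asyncio.run(quickstart_async.extract_news_teasers())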