"""
This example demonstrates how to use JSON CSS extraction to scrape product information
from Amazon search results. It shows how to extract structured data like product titles,
prices, ratings, and other details using CSS selectors.
"""

|
from crawl4ai import AsyncWebCrawler, CacheMode
|
|
|
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
|
|
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
|
|
|
import json
|
|
|
|
from playwright.async_api import Page, BrowserContext
|
|
|
|
|
async def extract_amazon_products() -> None:
    """Crawl Amazon, search for "Samsung Galaxy Tab", and print structured product data.

    The search is driven from an ``after_goto`` hook: after the crawler loads
    the Amazon home page, the hook fills the search box, submits the query,
    and waits for the result cards so that extraction runs against the search
    results page. Extracted products are printed to stdout; nothing is
    returned.
    """
    # Initialize browser config
    browser_config = BrowserConfig(
        # browser_type="chromium",
        headless=True
    )

    # Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button
    crawler_config = CrawlerRunConfig(
        # BYPASS: always re-fetch instead of serving a cached result.
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
                "name": "Amazon Product Search Results",
                # One extracted item per search-result card on the page.
                "baseSelector": "[data-component-type='s-search-result']",
                "fields": [
                    {
                        # Empty selector targets the base element itself, so
                        # the ASIN is read off the result card's own attribute.
                        "name": "asin",
                        "selector": "",
                        "type": "attribute",
                        "attribute": "data-asin",
                    },
                    {"name": "title", "selector": "h2 a span", "type": "text"},
                    {
                        "name": "url",
                        "selector": "h2 a",
                        "type": "attribute",
                        "attribute": "href",
                    },
                    {
                        "name": "image",
                        "selector": ".s-image",
                        "type": "attribute",
                        "attribute": "src",
                    },
                    {
                        "name": "rating",
                        "selector": ".a-icon-star-small .a-icon-alt",
                        "type": "text",
                    },
                    {
                        "name": "reviews_count",
                        "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": ".a-price .a-offscreen",
                        "type": "text",
                    },
                    {
                        "name": "original_price",
                        "selector": ".a-price.a-text-price .a-offscreen",
                        "type": "text",
                    },
                    {
                        # NOTE(review): "exists" presumably yields a truthy
                        # value when the selector matches (used as a boolean
                        # below) — confirm against JsonCssExtractionStrategy's
                        # supported field types.
                        "name": "sponsored",
                        "selector": ".puis-sponsored-label-text",
                        "type": "exists",
                    },
                    {
                        "name": "delivery_info",
                        "selector": "[data-cy='delivery-recipe'] .a-color-base",
                        "type": "text",
                        # NOTE(review): "multiple" is expected to collect every
                        # match into a list (joined with spaces when printed).
                        "multiple": True,
                    },
                ],
            }
        ),
    )

    # Start from the home page; the after_goto hook performs the search.
    url = "https://www.amazon.com/"

    async def after_goto(
        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
    ) -> Page:
        """Hook called after navigating to each URL.

        Fills the Amazon search box, submits the query, and waits for the
        search results to render. Errors are logged and swallowed so the
        crawl still proceeds with whatever page state is available.
        """
        print(f"[HOOK] after_goto - Successfully loaded: {url}")
        try:
            # Wait for search box to be available.
            # NOTE(review): the 1 s timeouts below may be tight for a cold
            # Amazon page load — consider raising them if this hook logs
            # timeout errors.
            search_box = await page.wait_for_selector(
                "#twotabsearchtextbox", timeout=1000
            )

            # Type the search query
            await search_box.fill("Samsung Galaxy Tab")

            # Get the search button and prepare for navigation
            search_button = await page.wait_for_selector(
                "#nav-search-submit-button", timeout=1000
            )

            # Click with navigation waiting
            await search_button.click()

            # Wait for search results to load (longer timeout: full page nav)
            await page.wait_for_selector(
                '[data-component-type="s-search-result"]', timeout=10000
            )
            print("[HOOK] Search completed and results loaded!")
        except Exception as e:
            # Best-effort: log and continue rather than aborting the crawl.
            print(f"[HOOK] Error during search operation: {str(e)}")

        return page

    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Register the hook so the search runs after the initial navigation.
        crawler.crawler_strategy.set_hook("after_goto", after_goto)

        # Extract the data
        result = await crawler.arun(url=url, config=crawler_config)

        # Process and print the results
        if result and result.extracted_content:
            # Parse the JSON string into a list of products
            products = json.loads(result.extracted_content)

            # Process each product in the list
            for product in products:
                print("\nProduct Details:")
                print(f"ASIN: {product.get('asin')}")
                print(f"Title: {product.get('title')}")
                print(f"Price: {product.get('price')}")
                print(f"Original Price: {product.get('original_price')}")
                print(f"Rating: {product.get('rating')}")
                print(f"Reviews: {product.get('reviews_count')}")
                print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
                if product.get("delivery_info"):
                    print(f"Delivery: {' '.join(product['delivery_info'])}")
                print("-" * 80)
if __name__ == "__main__":
    # Script entry point: run the async example to completion.
    import asyncio

    asyncio.run(extract_amazon_products())