
Add support for controlling browser geolocation, locale, and timezone settings:

- New GeolocationConfig class for managing GPS coordinates
- Add locale and timezone_id parameters to CrawlerRunConfig
- Update browser context creation to handle location settings
- Add example script for geolocation usage
- Update documentation with location-based identity features

This enables more precise control over browser identity and location reporting.
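At a glance, the new surface is three CrawlerRunConfig fields (a minimal sketch distilled from the example script below; the values are illustrative):

    from crawl4ai import CrawlerRunConfig, GeolocationConfig

    run_cfg = CrawlerRunConfig(
        locale="en-US",                      # Accept-Language & UI locale
        timezone_id="America/Los_Angeles",   # IANA timezone for JS Date()/Intl
        geolocation=GeolocationConfig(       # GPS coordinates reported to the page
            latitude=34.0522,
            longitude=-118.2437,
            accuracy=10.0,
        ),
    )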
# use_geo_location.py
"""
Example: override locale, timezone, and geolocation using Crawl4ai patterns.

This demo uses `AsyncWebCrawler.arun()` to fetch a page with a
browser context primed for a specific locale, timezone, and GPS,
and saves a screenshot for visual verification.
"""

import asyncio
import base64
from pathlib import Path
from typing import List

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    BrowserConfig,
    GeolocationConfig,
    CrawlResult,
)


async def demo_geo_override():
    """Demo: Crawl a geolocation-test page with overrides and screenshot."""
    print("\n=== Geo-Override Crawl ===")

    # 1) Browser setup: use Playwright-managed contexts
    browser_cfg = BrowserConfig(
        headless=False,
        viewport_width=1280,
        viewport_height=720,
        use_managed_browser=False,
    )

    # 2) Run config: include locale, timezone_id, geolocation, and screenshot
    run_cfg = CrawlerRunConfig(
        url="https://browserleaks.com/geo",  # test page that shows your location
        locale="en-US",                      # Accept-Language & UI locale
        timezone_id="America/Los_Angeles",   # JS Date()/Intl timezone
        geolocation=GeolocationConfig(       # override GPS coords
            latitude=34.0522,
            longitude=-118.2437,
            accuracy=10.0,
        ),
        screenshot=True,       # capture screenshot after load
        session_id="geo_test", # reuse context if rerunning
        delay_before_return_html=5,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # 3) Run crawl (returns a list even for a single URL)
        results: List[CrawlResult] = await crawler.arun(
            url=run_cfg.url,
            config=run_cfg,
        )
        result = results[0]

        # 4) Save screenshot and report path
        if result.screenshot:
            out_dir = Path(__file__).parent / "tmp"
            out_dir.mkdir(exist_ok=True)
            shot_path = out_dir / "geo_test.png"
            with open(shot_path, "wb") as f:
                f.write(base64.b64decode(result.screenshot))
            print(f"Saved screenshot to {shot_path}")
        else:
            print("No screenshot captured; check configuration.")


if __name__ == "__main__":
    asyncio.run(demo_geo_override())
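To check the override without eyeballing a screenshot, the crawl result's extracted markdown can be searched for the configured coordinates. A minimal companion sketch, under two assumptions: that the test page renders the reported coordinates into its content before `delay_before_return_html` elapses, and that `arun()` results index like a list (as in the example above):

    # verify_geo_override.py -- hypothetical companion check, not part of the commit
    import asyncio

    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig


    async def verify():
        cfg = CrawlerRunConfig(
            url="https://browserleaks.com/geo",
            locale="en-US",
            timezone_id="America/Los_Angeles",
            geolocation=GeolocationConfig(
                latitude=34.0522, longitude=-118.2437, accuracy=10.0
            ),
            delay_before_return_html=5,  # give the page time to read the (faked) GPS
        )
        async with AsyncWebCrawler() as crawler:
            results = await crawler.arun(url=cfg.url, config=cfg)
            # If the override took effect, the test page should echo the
            # configured latitude somewhere in its rendered content.
            page_text = str(results[0].markdown or "")
            print("Latitude visible in page:", "34.0522" in page_text)


    if __name__ == "__main__":
        asyncio.run(verify())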