mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-06-27 00:41:33 +00:00
feat(apps/test-suite): add Rafa's index benchmark notebook
This commit is contained in:
parent
6d1b9bf1fe
commit
6e8873762a
360
apps/test-suite/index-benchmark/run.ipynb
Normal file
360
apps/test-suite/index-benchmark/run.ipynb
Normal file
@ -0,0 +1,360 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "51331c01",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Index Benchmark"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9cb10752",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7928e2c9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a64ec8a0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from firecrawl import FirecrawlApp, ScrapeOptions\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from datetime import datetime\n",
|
||||
"import statistics\n",
|
||||
"import requests\n",
|
||||
"from time import sleep\n",
|
||||
"\n",
|
||||
"load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bc7ce797",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"app = FirecrawlApp(api_key=os.getenv(\"FIRECRAWL_API_KEY\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "440f7c2d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scrape_urls=[\n",
|
||||
" 'https://news.ycombinator.com', # - Hacker News (simple, fast-loading)\n",
|
||||
" 'https://httpbin.org', # - HTTP testing service (very reliable)\n",
|
||||
" 'https://example.com', # - Standard test domain (minimal content)\n",
|
||||
" 'https://github.com/microsoft/vscode', # - GitHub repo page (structured content)\n",
|
||||
" 'https://stackoverflow.com/questions', # - Stack Overflow questions page\n",
|
||||
" 'https://www.wikipedia.org', # - Wikipedia main page (rich content)\n",
|
||||
" 'https://jsonplaceholder.typicode.com', # - Fake API for testing\n",
|
||||
" 'https://httpstat.us/200', # - HTTP status testing (minimal response)\n",
|
||||
" 'https://docs.python.org/3/' # - Python documentation (structured docs)\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"crawl_urls = [\n",
|
||||
" \"https://www.pcbgogo.com\", # 7825\n",
|
||||
" \"https://github.com/Uniswap/v4-core\", # 7353\n",
|
||||
" \"https://www.arcep.fr/actualites\", # 9764\n",
|
||||
" \"https://www.synapticure.com\", # 7746\n",
|
||||
" \"https://www.elecrow.com\", # 8025\n",
|
||||
" \"https://www.idfcfirstbank.com\", # 9912\n",
|
||||
" \"https://www.todaytix.com\", # 7532\n",
|
||||
" \"https://www.wheel-size.com\", # 7102\n",
|
||||
" \"https://drymerge.com\", # 8422\n",
|
||||
" \"https://telegramindex.org\" # 5335\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e54e6677",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Scrape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3fed4cb6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Hypothesis: Indexed scrapes are faster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fb052d01",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Seconds to pause between the uncached and the cached scrape of each URL.\n",
"CACHE_PROPAGATION_SECONDS = 17\n",
"\n",
"scrape_times_no_cache = []\n",
"scrape_times_cached = []\n",
"\n",
"\n",
"def timed_scrape(url, max_age, label):\n",
"    \"\"\"Scrape `url` once with the given maxAge and print the outcome.\n",
"\n",
"    Returns the elapsed seconds, or None if any step of the attempt raised,\n",
"    so every call contributes exactly one entry per URL.\n",
"    \"\"\"\n",
"    try:\n",
"        start = datetime.now()\n",
"        doc = app.scrape_url(url, maxAge=max_age)\n",
"        elapsed = (datetime.now() - start).total_seconds()\n",
"        print(f\"  {label}: {elapsed:.2f}s ({doc.metadata['scrapeId']})\")\n",
"        return elapsed\n",
"    except Exception as e:\n",
"        print(f\"  {label}: FAILED - {e}\")\n",
"        return None\n",
"\n",
"\n",
"for i, url in enumerate(scrape_urls):\n",
"    print(f\"Testing {i+1}/{len(scrape_urls)}: {url}\")\n",
"\n",
"    # No cache (maxAge=1)\n",
"    scrape_times_no_cache.append(timed_scrape(url, 1, \"No cache\"))\n",
"\n",
"    print(\"  Waiting for cache to propagate...\")\n",
"    sleep(CACHE_PROPAGATION_SECONDS)\n",
"\n",
"    # Cached (maxAge=100000)\n",
"    scrape_times_cached.append(timed_scrape(url, 100000, \"Cached\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7dce8a83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Calculate averages\n",
"# Keep only URLs where both scrapes succeeded, so the two averages cover the same pages.\n",
"paired_times = [\n",
"    (fresh, cached)\n",
"    for fresh, cached in zip(scrape_times_no_cache, scrape_times_cached)\n",
"    if fresh is not None and cached is not None\n",
"]\n",
"valid_no_cache = [fresh for fresh, _ in paired_times]\n",
"valid_cached = [cached for _, cached in paired_times]\n",
"\n",
"if paired_times:\n",
"    avg_no_cache = statistics.mean(valid_no_cache)\n",
"    avg_cached = statistics.mean(valid_cached)\n",
"    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n",
"\n",
"    print(\"SCRAPE RESULTS:\")\n",
"    print(f\"URLs compared: {len(paired_times)}\")\n",
"    print(f\"Average no cache: {avg_no_cache:.2f}s\")\n",
"    print(f\"Average cached: {avg_cached:.2f}s\")\n",
"    print(f\"Speedup: {speedup:.1f}x faster with cache\")\n",
"    print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per request\")\n",
"else:\n",
"    # Say so explicitly instead of printing nothing.\n",
"    print(\"SCRAPE RESULTS: no URL completed both scrapes; nothing to average\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc682ba4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "df801504",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Crawl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "658374d7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"_Note: for now the crawl is only used to improve the Map results below._"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5628c39d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Hypothesis: Indexed crawls are faster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1482e163",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Placeholders: this cell does not record crawl timings.\n",
"crawl_times_no_cache = []\n",
"crawl_times_cached = []\n",
"\n",
"for i, url in enumerate(crawl_urls):\n",
"    try:\n",
"        print(f\"Crawling {i+1}/{len(crawl_urls)}: {url}\")\n",
"        result = app.crawl_url(url)\n",
"    except Exception as e:\n",
"        # Report the whole URL; `url[0]` would print only its first character.\n",
"        print(f\"{url} - Crawl FAILED - {e}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7eb27685",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "abe3f30e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Map"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "683c74da",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Hypothesis: Indexed Map should return more URLs after a crawl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f8d79207",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def map_request(url, ignore_index, timeout=300):\n",
"    \"\"\"\n",
"    Make a map request and return the links.\n",
"\n",
"    Args:\n",
"        url: Site to map.\n",
"        ignore_index: When True the request is sent with \"useIndex\": false.\n",
"        timeout: Seconds to wait for the HTTP response before giving up.\n",
"\n",
"    Returns:\n",
"        The \"links\" list from the response, or [] when the request fails,\n",
"        times out, returns a non-200 status, or the body is not valid JSON.\n",
"    \"\"\"\n",
"    payload = {\"url\": url, \"useIndex\": not ignore_index, \"limit\": 30000}\n",
"    # Use the key from the environment when there is one; otherwise keep the placeholder token.\n",
"    api_key = os.getenv(\"FIRECRAWL_API_KEY\") or \"no-auth\"\n",
"    headers = {'Content-Type': 'application/json', \"Authorization\": f\"Bearer {api_key}\"}\n",
"\n",
"    try:\n",
"        # Without a timeout a stalled connection would block the notebook forever.\n",
"        response = requests.post(\n",
"            \"https://api.firecrawl.dev/v1/map\",\n",
"            headers=headers,\n",
"            json=payload,\n",
"            timeout=timeout,\n",
"        )\n",
"    except requests.RequestException as e:\n",
"        print(f\"Map request failed: {e}\")\n",
"        return []\n",
"\n",
"    try:\n",
"        data = response.json()\n",
"    except ValueError:\n",
"        # Error bodies are not always JSON; show the raw text instead of raising.\n",
"        print(f\"HTTP {response.status_code}: {response.text[:500]}\")\n",
"        return []\n",
"\n",
"    if response.status_code == 200:\n",
"        return data.get('links', [])\n",
"    else:\n",
"        print(data)\n",
"        return []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a74da0a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"map_times_no_cache, map_times_cached = [], []\n",
"map_url_counts_no_cache, map_url_counts_cached = [], []\n",
"\n",
"# One entry per variant: (label, ignore_index flag, timings list, URL-count list).\n",
"# Order matters: the no-index request is made first for every URL.\n",
"variants = (\n",
"    (\"No index\", True, map_times_no_cache, map_url_counts_no_cache),\n",
"    (\"With index\", False, map_times_cached, map_url_counts_cached),\n",
")\n",
"\n",
"total = len(crawl_urls)\n",
"for position, url in enumerate(crawl_urls, start=1):\n",
"    print(f\"Testing {position}/{total}: {url}\")\n",
"\n",
"    for label, ignore_index, timings, url_counts in variants:\n",
"        started = datetime.now()\n",
"        links = map_request(url, ignore_index)\n",
"        elapsed = (datetime.now() - started).total_seconds()\n",
"\n",
"        timings.append(elapsed)\n",
"        url_counts.append(len(links))\n",
"        print(f\"  {label}: {elapsed:.2f}s, {len(links)} URLs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2fa88f5d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Calculate averages\n",
"# statistics.mean() raises StatisticsError on an empty list, so only report when there are measurements.\n",
"if map_times_no_cache and map_times_cached:\n",
"    avg_time_no_cache = statistics.mean(map_times_no_cache)\n",
"    avg_time_cached = statistics.mean(map_times_cached)\n",
"    avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)\n",
"    avg_urls_cached = statistics.mean(map_url_counts_cached)\n",
"\n",
"    time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0\n",
"    url_difference = avg_urls_cached - avg_urls_no_cache\n",
"    url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0\n",
"\n",
"    print(\"MAP RESULTS:\")\n",
"    print(f\"Average time (no cache): {avg_time_no_cache:.2f}s\")\n",
"    print(f\"Average time (cached): {avg_time_cached:.2f}s\")\n",
"    print(f\"Time speedup: {time_speedup:.2f}x faster with cache\")\n",
"    print(f\"Average URLs found (no cache): {avg_urls_no_cache:.1f}\")\n",
"    print(f\"Average URLs found (cached): {avg_urls_cached:.1f}\")\n",
"    print(f\"URL difference: {url_difference:+.1f} URLs with cache\")\n",
"    print(f\"URL percentage: {url_percentage:.1f}% of no-cache results\")\n",
"\n",
"    if url_difference > 0:\n",
"        print(\"✅ Cache finds MORE URLs\")\n",
"    elif url_difference < 0:\n",
"        print(\"⚠️ Cache finds FEWER URLs\")\n",
"    else:\n",
"        print(\"➡️ Cache finds SAME number of URLs\")\n",
"else:\n",
"    print(\"MAP RESULTS: no measurements collected; nothing to average\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e5ee2116",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user