{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "51331c01",
   "metadata": {},
   "source": [
    "# Index Benchmark"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9cb10752",
   "metadata": {},
   "source": [
    "Benchmarks how Firecrawl's index/cache affects three operations:\n",
    "\n",
    "- **Scrape** - each URL is scraped with `maxAge=1` (no cache) and then with `maxAge=100000` (cached).\n",
    "- **Crawl** - each site is crawled once; no crawl timings are recorded yet.\n",
    "- **Map** - `/v1/map` is called with `useIndex` off and on, comparing latency and the number of URLs returned.\n",
    "\n",
    "`FIRECRAWL_API_KEY` is read from the environment (or a `.env` file)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7928e2c9",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a64ec8a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from firecrawl import FirecrawlApp, ScrapeOptions\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from datetime import datetime\n",
    "import statistics\n",
    "import requests\n",
    "from time import perf_counter, sleep\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc7ce797",
   "metadata": {},
   "outputs": [],
   "source": [
    "app = FirecrawlApp(api_key=os.getenv(\"FIRECRAWL_API_KEY\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "440f7c2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "scrape_urls=[\n",
    "    'https://news.ycombinator.com',  # - Hacker News (simple, fast-loading)\n",
    "    'https://httpbin.org',  # - HTTP testing service (very reliable)\n",
    "    'https://example.com',  # - Standard test domain (minimal content)\n",
    "    'https://github.com/microsoft/vscode',  # - GitHub repo page (structured content)\n",
    "    'https://stackoverflow.com/questions',  # - Stack Overflow questions page\n",
    "    'https://www.wikipedia.org',  # - Wikipedia main page (rich content)\n",
    "    'https://jsonplaceholder.typicode.com',  # - Fake API for testing\n",
    "    'https://httpstat.us/200',  # - HTTP status testing (minimal response)\n",
    "    'https://docs.python.org/3/'  # - Python documentation (structured docs)\n",
    "]\n",
    "\n",
    "\n",
    "crawl_urls = [\n",
    "    \"https://www.pcbgogo.com\",  # 7825\n",
    "    \"https://github.com/Uniswap/v4-core\",  # 7353\n",
    "    \"https://www.arcep.fr/actualites\",  # 9764\n",
    "    \"https://www.synapticure.com\",  # 7746\n",
    "    \"https://www.elecrow.com\",  # 8025\n",
    "    \"https://www.idfcfirstbank.com\",  # 9912\n",
    "    \"https://www.todaytix.com\",  # 7532\n",
    "    \"https://www.wheel-size.com\",  # 7102\n",
    "    \"https://drymerge.com\",  # 8422\n",
    "    \"https://telegramindex.org\"  # 5335\n",
    "]\n",
    "\n",
    "# Seconds to wait between the uncached and the cached scrape of a URL,\n",
    "# giving the first result time to propagate to the cache.\n",
    "CACHE_PROPAGATION_SECONDS = 17\n",
    "\n",
    "# Settings for the raw /v1/map requests made in the Map section.\n",
    "MAP_ENDPOINT = \"https://api.firecrawl.dev/v1/map\"\n",
    "MAP_LIMIT = 30000\n",
    "MAP_TIMEOUT_SECONDS = 120  # upper bound so one stuck request cannot hang the run"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e54e6677",
   "metadata": {},
   "source": [
    "## Scrape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fed4cb6",
   "metadata": {},
   "source": [
    "Hypothesis: Indexed scrapes are faster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb052d01",
   "metadata": {},
   "outputs": [],
   "source": [
    "def timed_scrape(url, max_age, label):\n",
    "    \"\"\"Scrape `url` once with the given maxAge and return the elapsed seconds.\n",
    "\n",
    "    Only the request itself is timed and guarded. When the request raises,\n",
    "    the error is printed and None is returned, so every call yields exactly\n",
    "    one value and the result lists stay aligned with `scrape_urls`.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        start = perf_counter()\n",
    "        doc = app.scrape_url(url, maxAge=max_age)\n",
    "        elapsed = perf_counter() - start\n",
    "    except Exception as e:\n",
    "        print(f\"  {label}: FAILED - {e}\")\n",
    "        return None\n",
    "\n",
    "    # Looked up outside the guarded request: a result without a scrape id\n",
    "    # must not turn a successful measurement into a failure.\n",
    "    try:\n",
    "        scrape_id = doc.metadata['scrapeId']\n",
    "    except (AttributeError, KeyError, TypeError):\n",
    "        scrape_id = 'n/a'\n",
    "    print(f\"  {label}: {elapsed:.2f}s ({scrape_id})\")\n",
    "    return elapsed\n",
    "\n",
    "\n",
    "scrape_times_no_cache = []\n",
    "scrape_times_cached = []\n",
    "\n",
    "for i, url in enumerate(scrape_urls):\n",
    "    print(f\"Testing {i+1}/{len(scrape_urls)}: {url}\")\n",
    "\n",
    "    # No cache (maxAge=1)\n",
    "    scrape_times_no_cache.append(timed_scrape(url, 1, \"No cache\"))\n",
    "\n",
    "    print(\"  Waiting for cache to propagate...\")\n",
    "    sleep(CACHE_PROPAGATION_SECONDS)\n",
    "\n",
    "    # Cached (maxAge=100000)\n",
    "    scrape_times_cached.append(timed_scrape(url, 100000, \"Cached\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dce8a83",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate averages\n",
    "# Only URLs where both the uncached and the cached scrape succeeded are\n",
    "# compared, so both averages cover the same set of pages.\n",
    "paired_times = [\n",
    "    (no_cache, cached)\n",
    "    for no_cache, cached in zip(scrape_times_no_cache, scrape_times_cached)\n",
    "    if no_cache is not None and cached is not None\n",
    "]\n",
    "valid_no_cache = [pair[0] for pair in paired_times]\n",
    "valid_cached = [pair[1] for pair in paired_times]\n",
    "\n",
    "if paired_times:\n",
    "    avg_no_cache = statistics.mean(valid_no_cache)\n",
    "    avg_cached = statistics.mean(valid_cached)\n",
    "    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0\n",
    "\n",
    "    print(\"SCRAPE RESULTS:\")\n",
    "    print(f\"URLs compared: {len(paired_times)}/{len(scrape_urls)}\")\n",
    "    print(f\"Average no cache: {avg_no_cache:.2f}s\")\n",
    "    print(f\"Average cached: {avg_cached:.2f}s\")\n",
    "    print(f\"Speedup: {speedup:.1f}x faster with cache\")\n",
    "    print(f\"Time saved: {avg_no_cache - avg_cached:.2f}s per request\")\n",
    "else:\n",
    "    print(\"SCRAPE RESULTS: no URL succeeded both with and without cache\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc682ba4",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df801504",
   "metadata": {},
   "source": [
    "## Crawl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "658374d7",
   "metadata": {},
   "source": [
    "**Note:** for now the crawl is only used to improve the Map results below; no crawl timings are recorded."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5628c39d",
   "metadata": {},
   "source": [
    "Hypothesis: Indexed crawls are faster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1482e163",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Placeholders: crawl timings are not collected yet, so these stay empty.\n",
    "crawl_times_no_cache = []\n",
    "crawl_times_cached = []\n",
    "\n",
    "for i, url in enumerate(crawl_urls):\n",
    "    try:\n",
    "        print(f\"Crawling {i+1}/{len(crawl_urls)}: {url}\")\n",
    "        result = app.crawl_url(url)\n",
    "    except Exception as e:\n",
    "        # Report the failing URL and carry on with the remaining sites.\n",
    "        print(f\"{url} - Crawl FAILED - {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7eb27685",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "abe3f30e",
   "metadata": {},
   "source": [
    "## Map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "683c74da",
   "metadata": {},
   "source": [
    "Hypothesis: Indexed Map should get more URLs after crawl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8d79207",
   "metadata": {},
   "outputs": [],
   "source": [
    "def map_request(url, ignore_index):\n",
    "    \"\"\"\n",
    "    Make a map request and return the links\n",
    "\n",
    "    Args:\n",
    "        url: Site to map.\n",
    "        ignore_index: When True the request is sent with \"useIndex\": false,\n",
    "            otherwise with \"useIndex\": true.\n",
    "\n",
    "    Returns:\n",
    "        The \"links\" list of the response, or [] when the request fails,\n",
    "        times out, has a non-JSON body, or returns a non-200 status. The\n",
    "        problem is printed in each of those cases.\n",
    "    \"\"\"\n",
    "    payload = {\"url\": url, \"useIndex\": not ignore_index, \"limit\": MAP_LIMIT}\n",
    "    # Use the same key as the FirecrawlApp client; the previous hard-coded\n",
    "    # value stays as the fallback when the variable is not set.\n",
    "    api_key = os.getenv(\"FIRECRAWL_API_KEY\") or \"no-auth\"\n",
    "    headers = {'Content-Type': 'application/json', \"Authorization\": f\"Bearer {api_key}\"}\n",
    "\n",
    "    try:\n",
    "        response = requests.post(\n",
    "            MAP_ENDPOINT,\n",
    "            headers=headers,\n",
    "            json=payload,\n",
    "            timeout=MAP_TIMEOUT_SECONDS,\n",
    "        )\n",
    "    except requests.RequestException as e:\n",
    "        # Connection errors and timeouts follow the same convention as HTTP\n",
    "        # errors below: report and return no links.\n",
    "        print(f\"Map request FAILED - {e}\")\n",
    "        return []\n",
    "\n",
    "    try:\n",
    "        data = response.json()\n",
    "    except ValueError:\n",
    "        # The body is not guaranteed to be JSON (e.g. a gateway error page).\n",
    "        print(f\"HTTP {response.status_code}: {response.text[:500]}\")\n",
    "        return []\n",
    "\n",
    "    if response.status_code == 200:\n",
    "        return data.get('links', [])\n",
    "\n",
    "    print(data)\n",
    "    return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a74da0a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "map_times_no_cache = []\n",
    "map_times_cached = []\n",
    "map_url_counts_no_cache = []\n",
    "map_url_counts_cached = []\n",
    "\n",
    "# map_request returns [] on failure, so a failed request is recorded as\n",
    "# 0 URLs and its elapsed time is still included.\n",
    "for i, url in enumerate(crawl_urls):\n",
    "    print(f\"Testing {i+1}/{len(crawl_urls)}: {url}\")\n",
    "\n",
    "    # No index (ignore_index=True -> useIndex false)\n",
    "    start = perf_counter()\n",
    "    links_no_index = map_request(url, ignore_index=True)\n",
    "    time_no_index = perf_counter() - start\n",
    "\n",
    "    map_times_no_cache.append(time_no_index)\n",
    "    map_url_counts_no_cache.append(len(links_no_index))\n",
    "    print(f\"  No index: {time_no_index:.2f}s, {len(links_no_index)} URLs\")\n",
    "\n",
    "    # With index (ignore_index=False -> useIndex true)\n",
    "    start = perf_counter()\n",
    "    links_indexed = map_request(url, ignore_index=False)\n",
    "    time_indexed = perf_counter() - start\n",
    "\n",
    "    map_times_cached.append(time_indexed)\n",
    "    map_url_counts_cached.append(len(links_indexed))\n",
    "    print(f\"  With index: {time_indexed:.2f}s, {len(links_indexed)} URLs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fa88f5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate averages\n",
    "# statistics.mean raises on empty input, so guard the same way the scrape\n",
    "# summary does.\n",
    "if map_times_no_cache and map_times_cached:\n",
    "    avg_time_no_cache = statistics.mean(map_times_no_cache)\n",
    "    avg_time_cached = statistics.mean(map_times_cached)\n",
    "    avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)\n",
    "    avg_urls_cached = statistics.mean(map_url_counts_cached)\n",
    "\n",
    "    time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0\n",
    "    url_difference = avg_urls_cached - avg_urls_no_cache\n",
    "    url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0\n",
    "\n",
    "    print(\"MAP RESULTS:\")\n",
    "    print(f\"Average time (no cache): {avg_time_no_cache:.2f}s\")\n",
    "    print(f\"Average time (cached): {avg_time_cached:.2f}s\")\n",
    "    print(f\"Time speedup: {time_speedup:.2f}x faster with cache\")\n",
    "    print(f\"Average URLs found (no cache): {avg_urls_no_cache:.1f}\")\n",
    "    print(f\"Average URLs found (cached): {avg_urls_cached:.1f}\")\n",
    "    print(f\"URL difference: {url_difference:+.1f} URLs with cache\")\n",
    "    print(f\"URL percentage: {url_percentage:.1f}% of no-cache results\")\n",
    "\n",
    "    if url_difference > 0:\n",
    "        print(\"✅ Cache finds MORE URLs\")\n",
    "    elif url_difference < 0:\n",
    "        print(\"⚠️ Cache finds FEWER URLs\")\n",
    "    else:\n",
    "        print(\"➡️ Cache finds SAME number of URLs\")\n",
    "else:\n",
    "    print(\"MAP RESULTS: no measurements recorded\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5ee2116",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}