crawl4ai/docs/examples/crawlai_vs_firecrawl.py
2025-01-13 19:19:58 +08:00

71 lines
2.7 KiB
Python

import os, time
# append the path to the root of the project
import sys
import asyncio
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from firecrawl import FirecrawlApp
from crawl4ai import AsyncWebCrawler
__data__ = os.path.join(os.path.dirname(__file__), "..", "..") + "/.data"
async def compare():
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
# Tet Firecrawl with a simple crawl
start = time.time()
scrape_status = app.scrape_url(
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print(f"Time taken: {end - start} seconds")
print(len(scrape_status["markdown"]))
# save the markdown content with provider name
with open(f"{__data__}/firecrawl_simple.md", "w") as f:
f.write(scrape_status["markdown"])
# Count how many "cldnry.s-nbcnews.com" are in the markdown
print(scrape_status["markdown"].count("cldnry.s-nbcnews.com"))
async with AsyncWebCrawler() as crawler:
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
# js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
word_count_threshold=0,
bypass_cache=True,
verbose=False,
)
end = time.time()
print(f"Time taken: {end - start} seconds")
print(len(result.markdown))
# save the markdown content with provider name
with open(f"{__data__}/crawl4ai_simple.md", "w") as f:
f.write(result.markdown)
# count how many "cldnry.s-nbcnews.com" are in the markdown
print(result.markdown.count("cldnry.s-nbcnews.com"))
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
js_code=[
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
],
word_count_threshold=0,
bypass_cache=True,
verbose=False,
)
end = time.time()
print(f"Time taken: {end - start} seconds")
print(len(result.markdown))
# save the markdown content with provider name
with open(f"{__data__}/crawl4ai_js.md", "w") as f:
f.write(result.markdown)
# count how many "cldnry.s-nbcnews.com" are in the markdown
print(result.markdown.count("cldnry.s-nbcnews.com"))
if __name__ == "__main__":
asyncio.run(compare())