import os
import re
import sys
import pytest
from bs4 import BeautifulSoup
import asyncio

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler

# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content


# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content


# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()


# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()


# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()


# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()

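
# NOTE: The edge-case tests above are currently disabled; they hit live
# third-party pages whose size and markup can change over time.
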
# Add this test to your existing test file
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
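    """
    Crawl three pages of the TypeScript commit history within one crawler
    session: the first request loads the URL, the next two only execute
    JavaScript that clicks GitHub's "Next" pagination button, and the test
    asserts that at least 90 commits are collected in total.
    """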
first_commit = ""
|
|
|
|
async def on_execution_started(page):
|
|
nonlocal first_commit
|
|
try:
|
|
# Check if the page firct commit h4 text is different from the first commit (use document.querySelector('li.Box-sc-g0xbh4-0 h4'))
|
|
while True:
|
|
await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
|
|
commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
|
|
commit = await commit.evaluate("(element) => element.textContent")
|
|
commit = re.sub(r"\s+", "", commit)
|
|
if commit and commit != first_commit:
|
|
first_commit = commit
|
|
break
|
|
await asyncio.sleep(0.5)
|
|
except Exception as e:
|
|
print(f"Warning: New content didn't appear after JavaScript execution: {e}")
|
|
|
|
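
    # Registered below as the "on_execution_started" hook: it polls every 0.5 s
    # until the first commit heading changes, so the crawl does not extract the
    # previous page's stale content after the pagination click.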
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """
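
        # Reuse the same session_id on every iteration so pagination state carries
        # over: only the first iteration navigates to the URL, the rest just run
        # js_next_page in the already-open session page (js_only=True).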
        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # Only use URL for the first page
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0,  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = soup.select("li")
            # Take the first commit, find its h4, and extract the text
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r"\s+", "", first_commit)
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
        assert (
            len(all_commits) >= 90
        ), f"Expected at least 90 commits, but got {len(all_commits)}"

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])