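"""Edge-case tests for AsyncWebCrawler: large, minimal, SPA, infinite-scroll,
JS-heavy, and mixed-content pages (commented examples below), plus multi-page
session crawling of the TypeScript commit history on GitHub."""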
import os
import re
import sys
import pytest
import json
from bs4 import BeautifulSoup
import asyncio

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler

# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()

# Add this test to your existing test file
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    """Crawl three pages of the TypeScript commit history in a single browser
    session, clicking the next-page button via JavaScript and waiting for new
    commits to render before extracting them."""
    first_commit = ""

    async def on_execution_started(page):
        nonlocal first_commit
        try:
            # Poll until the first commit h4 text (document.querySelector('li.Box-sc-g0xbh4-0 h4'))
            # differs from the previously seen commit, i.e. the next page has rendered.
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # Only used for the first page
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")

            # Take the first commit, find its h4, and extract the text
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
        assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])