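"""Edge-case tests for AsyncWebCrawler: large, minimal, SPA, infinite-scroll,
JS-heavy, and mixed-content pages (commented examples below), plus multi-page
session crawling of the TypeScript commit history on GitHub."""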
import os
import re
import sys
import pytest
import json
from bs4 import BeautifulSoup
import asyncio

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler

# @pytest.mark.asyncio
# async def test_large_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://en.wikipedia.org/wiki/List_of_largest_known_stars"  # A page with a large table
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) > 1000000  # Expecting more than 1MB of content

# @pytest.mark.asyncio
# async def test_minimal_content_page():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://example.com"  # A very simple page
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert len(result.html) < 10000  # Expecting less than 10KB of content

# @pytest.mark.asyncio
# async def test_single_page_application():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://reactjs.org/"  # React's website is a SPA
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "react" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_infinite_scroll():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://news.ycombinator.com/"  # Hacker News has infinite scroll
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "hacker news" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_heavy_javascript():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.airbnb.com/"  # Airbnb uses a lot of JavaScript
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "airbnb" in result.html.lower()

# @pytest.mark.asyncio
# async def test_page_with_mixed_content():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://github.com/"  # GitHub has a mix of static and dynamic content
#         result = await crawler.arun(url=url, bypass_cache=True)
#         assert result.success
#         assert "github" in result.html.lower()

# Add this test to your existing test file
@pytest.mark.asyncio
async def test_typescript_commits_multi_page():
    """Crawl three pages of the TypeScript commit history in a single browser
    session, clicking the next-page button via JavaScript and waiting for new
    commits to render before extracting them."""
    first_commit = ""

    async def on_execution_started(page):
        nonlocal first_commit
        try:
            # Poll until the first commit h4 text (document.querySelector('li.Box-sc-g0xbh4-0 h4'))
            # differs from the previously seen commit, i.e. the next page has rendered.
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,  # Only used for the first page
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
                bypass_cache=True,
                js_only=page > 0  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")

            # Take the first commit, find its h4, and extract the text
            first_commit = commits[0].find("h4").text
            first_commit = re.sub(r'\s+', '', first_commit)
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the session
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
        assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")


# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])