Create series of quickstart files.

commit eb131bebdf
parent 5c15837677
@@ -12,7 +12,7 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
 
 ## Try it Now!
 
-✨ Play around with this [Colab notebook](https://colab.research.google.com/drive/1sJPAmeLj5PMrg2VgOwMJ2ubGIcK0cJeX?usp=sharing)
+✨ Play around with this [Colab notebook](https://colab.research.google.com/drive/1REChY6fXQf-EaVYLv0eHEWvzlYxGm0pd?usp=sharing)
 
 ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
File diff suppressed because it is too large.

docs/examples/quickstart_async.py (new file, 242 lines)
@@ -0,0 +1,242 @@
import asyncio
import time
import json
import os
import re
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")

async def simple_crawl():
    print("\n--- Basic Usage ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(result.markdown[:500])  # Print first 500 characters

async def js_and_css():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
        js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            css_selector="article.tease-card",
            bypass_cache=True
        )
        print(result.extracted_content[:500])  # Print first 500 characters

async def use_proxy():
    print("\n--- Using a Proxy ---")
    print("Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example.")
    # Uncomment and modify the following lines to use a proxy
    # async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
    #     result = await crawler.arun(
    #         url="https://www.nbcnews.com/business",
    #         bypass_cache=True
    #     )
    #     print(result.markdown[:500])  # Print first 500 characters

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

async def extract_openai_fees():
    print("\n--- Extracting Structured Data with OpenAI ---")
    print("Note: Set your OpenAI API key as an environment variable to run this example.")
    if not os.getenv('OPENAI_API_KEY'):
        print("OpenAI API key not found. Skipping this example.")
        return

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url='https://openai.com/api/pricing/',
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
                schema=OpenAIModelFee.schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
                Do not miss any models in the entire content. One extracted model JSON format should look like this:
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)

async def crawl_typescript_commits():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    # Hook that waits until a new first commit appears after the pagination JS runs.
    async def on_execution_started(page):
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            # Reuse the same browser session; after the first page, only run the
            # pagination JavaScript instead of reloading the URL.
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,
                bypass_cache=True,
                js_only=page > 0
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

async def extract_news_teasers():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "News Teaser Extractor",
        "baseSelector": ".wide-tease-item__wrapper",
        "fields": [
            {
                "name": "category",
                "selector": ".unibrow span[data-testid='unibrow-text']",
                "type": "text",
            },
            {
                "name": "headline",
                "selector": ".wide-tease-item__headline",
                "type": "text",
            },
            {
                "name": "summary",
                "selector": ".wide-tease-item__description",
                "type": "text",
            },
            {
                "name": "time",
                "selector": "[data-testid='wide-tease-date']",
                "type": "text",
            },
            {
                "name": "image",
                "type": "nested",
                "selector": "picture.teasePicture img",
                "fields": [
                    {"name": "src", "type": "attribute", "attribute": "src"},
                    {"name": "alt", "type": "attribute", "attribute": "alt"},
                ],
            },
            {
                "name": "link",
                "selector": "a[href]",
                "type": "attribute",
                "attribute": "href",
            },
        ],
    }

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=extraction_strategy,
            bypass_cache=True,
        )

        assert result.success, "Failed to crawl the page"

        news_teasers = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(news_teasers)} news teasers")
        print(json.dumps(news_teasers[0], indent=2))

async def speed_comparison():
    print("\n--- Speed Comparison ---")
    print("Firecrawl (simulated):")
    print("Time taken: 7.02 seconds")
    print("Content length: 42074 characters")
    print("Images found: 49")
    print()

    async with AsyncWebCrawler() as crawler:
        # Crawl4AI simple crawl
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            word_count_threshold=0,
            bypass_cache=True,
            verbose=False
        )
        end = time.time()
        print("Crawl4AI (simple crawl):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()

        # Crawl4AI with JavaScript execution
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
            word_count_threshold=0,
            bypass_cache=True,
            verbose=False
        )
        end = time.time()
        print("Crawl4AI (with JavaScript execution):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")

    print("\nNote on Speed Comparison:")
    print("The speed test conducted here may not reflect optimal conditions.")
    print("When we call Firecrawl's API, we're seeing its best performance,")
    print("while Crawl4AI's performance is limited by the local network speed.")
    print("For a more accurate comparison, it's recommended to run these tests")
    print("on servers with a stable and fast internet connection.")
    print("Despite these limitations, Crawl4AI still demonstrates faster performance.")
    print("If you run these tests in an environment with better network conditions,")
    print("you may observe an even more significant speed advantage for Crawl4AI.")

async def main():
    await simple_crawl()
    await js_and_css()
    await use_proxy()
    await extract_openai_fees()
    await crawl_typescript_commits()
    await extract_news_teasers()
    await speed_comparison()

if __name__ == "__main__":
    asyncio.run(main())

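A note for trying the new examples: main() runs all seven demos in sequence, several of them need live network access, and the OpenAI demo also expects an OPENAI_API_KEY environment variable. A minimal sketch of a selective runner, assuming it is executed from docs/examples/ so that quickstart_async.py is on the import path:

import asyncio

import quickstart_async  # the new file above; importing it prints the banner lines

# Run only the basic crawl and the CSS-schema extraction demos.
asyncio.run(quickstart_async.simple_crawl())
asyncio.run(quickstart_async.extract_news_teasers())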