import os
import importlib
import asyncio
from functools import lru_cache
import logging

logging.basicConfig(level=logging.DEBUG)

from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.exceptions import RequestValidationError
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import FileResponse

from pydantic import BaseModel, HttpUrl
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.database import get_total_count, clear_db

# Configuration
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
MAX_CONCURRENT_REQUESTS = 10  # Adjust this to change the maximum concurrent requests
current_requests = 0
lock = asyncio.Lock()
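# current_requests and lock implement a simple global throttle: the /crawl endpoint
# below increments the counter under the lock and returns HTTP 429 once
# MAX_CONCURRENT_REQUESTS crawls are already in flight.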

app = FastAPI()

# CORS configuration
origins = ["*"]  # Allow all origins

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # List of origins that are allowed to make requests
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Mount the pages directory as a static directory
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs")

site_templates = Jinja2Templates(directory=__location__ + "/site")
templates = Jinja2Templates(directory=__location__ + "/pages")


@lru_cache()
def get_crawler():
    # Initialize and return a single, cached WebCrawler instance.
    # lru_cache() on a zero-argument function means every request shares
    # the same warmed-up crawler.
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


class CrawlRequest(BaseModel):
    urls: List[str]
    include_raw_html: Optional[bool] = False
    bypass_cache: bool = False
    extract_blocks: bool = True
    word_count_threshold: Optional[int] = 5
    extraction_strategy: Optional[str] = "NoExtractionStrategy"
    extraction_strategy_args: Optional[dict] = {}
    chunking_strategy: Optional[str] = "RegexChunking"
    chunking_strategy_args: Optional[dict] = {}
    css_selector: Optional[str] = None
    screenshot: Optional[bool] = False
    user_agent: Optional[str] = None
    verbose: Optional[bool] = True
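
# Illustrative request body for POST /crawl, matching the fields above
# (the URL is a placeholder):
# {
#     "urls": ["https://example.com"],
#     "include_raw_html": false,
#     "extraction_strategy": "NoExtractionStrategy",
#     "chunking_strategy": "RegexChunking",
#     "word_count_threshold": 5,
#     "screenshot": false
# }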


@app.get("/")
def read_root():
    return RedirectResponse(url="/mkdocs")


@app.get("/old", response_class=HTMLResponse)
async def read_index(request: Request):
    partials_dir = os.path.join(__location__, "pages", "partial")
    partials = {}

    # Load every HTML partial in pages/partial and expose it to the template
    # under its filename without the ".html" extension.
    for filename in os.listdir(partials_dir):
        if filename.endswith(".html"):
            with open(os.path.join(partials_dir, filename), "r", encoding="utf8") as file:
                partials[filename[:-5]] = file.read()

    return templates.TemplateResponse("index.html", {"request": request, **partials})


@app.get("/total-count")
async def get_total_url_count():
    count = get_total_count()
    return JSONResponse(content={"count": count})


@app.get("/clear-db")
async def clear_database():
    # Note: clear_db() is commented out, so this endpoint currently only returns a confirmation message.
    # clear_db()
    return JSONResponse(content={"message": "Database cleared."})


def import_strategy(module_name: str, class_name: str, *args, **kwargs):
    # Dynamically resolve a strategy class by name and instantiate it with the given arguments.
    try:
        module = importlib.import_module(module_name)
        strategy_class = getattr(module, class_name)
        return strategy_class(*args, **kwargs)
    except ImportError:
        print("ImportError: Module not found.")
        raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
    except AttributeError:
        print("AttributeError: Class not found.")
        raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
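
# For example, the /crawl endpoint below resolves its default strategies roughly as
#   import_strategy("crawl4ai.chunking_strategy", "RegexChunking", verbose=True)
# which returns an instance of crawl4ai.chunking_strategy.RegexChunking (illustrative
# call; the exact constructor arguments depend on the chosen strategy class).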


@app.post("/crawl")
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
    logging.debug(f"[LOG] Crawl request for URLs: {crawl_request.urls}")
    global current_requests

    # Reject the request early if the server is already at its concurrency limit.
    async with lock:
        if current_requests >= MAX_CONCURRENT_REQUESTS:
            raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
        current_requests += 1

    try:
        logging.debug("[LOG] Loading extraction and chunking strategies...")
        crawl_request.extraction_strategy_args['verbose'] = True
        crawl_request.chunking_strategy_args['verbose'] = True

        extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
        chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)

        # Use ThreadPoolExecutor to run the synchronous WebCrawler in an async manner
        logging.debug("[LOG] Running the WebCrawler...")
        with ThreadPoolExecutor() as executor:
            loop = asyncio.get_event_loop()
            futures = [
                loop.run_in_executor(
                    executor,
                    get_crawler().run,
                    str(url),
                    crawl_request.word_count_threshold,
                    extraction_strategy,
                    chunking_strategy,
                    crawl_request.bypass_cache,
                    crawl_request.css_selector,
                    crawl_request.screenshot,
                    crawl_request.user_agent,
                    crawl_request.verbose
                )
                for url in crawl_request.urls
            ]
            results = await asyncio.gather(*futures)

        # If include_raw_html is False, remove the raw HTML content from the results
        if not crawl_request.include_raw_html:
            for result in results:
                result.html = None

        return {"results": [result.model_dump() for result in results]}
    finally:
        # Always release the slot, even if the crawl raised.
        async with lock:
            current_requests -= 1


@app.get("/strategies/extraction", response_class=JSONResponse)
async def get_extraction_strategies():
    with open(f"{__location__}/docs/extraction_strategies.json", "r") as file:
        return JSONResponse(content=file.read())


@app.get("/strategies/chunking", response_class=JSONResponse)
async def get_chunking_strategies():
    with open(f"{__location__}/docs/chunking_strategies.json", "r") as file:
        return JSONResponse(content=file.read())


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8888)
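
# Example (illustrative) client call once the server is running; the requests
# library is not a dependency of this module and is used here only for the sketch:
#   import requests
#   requests.post("http://localhost:8888/crawl", json={"urls": ["https://example.com"]})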