Update .gitignore to ignore temporary and test directories

unclecode 2024-09-26 15:09:49 +08:00
parent 64190dd0c4
commit 8b6e88c85c
11 changed files with 54 additions and 235 deletions

.gitignore

@@ -191,4 +191,9 @@ ec2*
update_changelog.sh
.DS_Store
.DS_Store
docs/.DS_Store
tmp/
test_env/
**/.DS_Store
**/.DS_Store

Dockerfile (deleted)

@@ -1,67 +0,0 @@
# First stage: Build and install dependencies
FROM python:3.10-slim-bookworm
# Set the working directory in the container
WORKDIR /usr/src/app
# Define build arguments
ARG INSTALL_OPTION=default
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
curl \
unzip \
gnupg \
xvfb \
ca-certificates \
apt-transport-https \
software-properties-common && \
rm -rf /var/lib/apt/lists/*
# Copy the application code
COPY . .
# Install Crawl4AI using the local setup.py with the specified option
# and download models only for torch, transformer, or all options
RUN if [ "$INSTALL_OPTION" = "all" ]; then \
pip install --no-cache-dir .[all] && \
crawl4ai-download-models; \
elif [ "$INSTALL_OPTION" = "torch" ]; then \
pip install --no-cache-dir .[torch] && \
crawl4ai-download-models; \
elif [ "$INSTALL_OPTION" = "transformer" ]; then \
pip install --no-cache-dir .[transformer] && \
crawl4ai-download-models; \
else \
pip install --no-cache-dir .; \
fi
# Install Google Chrome
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
apt-get update && \
apt-get install -y google-chrome-stable
# Set environment to use Chrome properly
ENV CHROME_BIN=/usr/bin/google-chrome \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH=/opt/conda/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Install mkdocs
RUN pip install mkdocs mkdocs-terminal
# Call mkdocs to build the documentation
RUN mkdocs build
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

Second Dockerfile (deleted)

@@ -1,44 +0,0 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim
# Set the working directory in the container
WORKDIR /usr/src/app
# Copy the current directory contents into the container at /usr/src/app
COPY . .
# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
# Install dependencies for Chrome and ChromeDriver
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
xvfb \
unzip \
curl \
gnupg2 \
ca-certificates \
apt-transport-https \
software-properties-common \
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
&& apt-get update \
&& apt-get install -y google-chrome-stable \
&& rm -rf /var/lib/apt/lists/* \
&& apt install chromium-chromedriver -y
# Install spacy library using pip
RUN pip install spacy
# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
# Make port 80 available to the world outside this container
EXPOSE 80
# Define environment variable
ENV PYTHONUNBUFFERED 1
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

crawl4ai/__init__.py

@@ -3,7 +3,7 @@
from .async_webcrawler import AsyncWebCrawler
from .models import CrawlResult
__version__ = "0.3.1"
__version__ = "0.3.2"
__all__ = [
"AsyncWebCrawler",

Docker Compose file (deleted)

@@ -1,10 +0,0 @@
version: '3.8'
services:
web:
build: .
command: uvicorn main:app --host 0.0.0.0 --port 80 --workers $(nproc)
ports:
- "80:80"
environment:
- PYTHONUNBUFFERED=1

Example script

@@ -3,9 +3,9 @@ import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
import asyncio
import nest_asyncio
# import nest_asyncio
# nest_asyncio.apply()
nest_asyncio.apply()
import time
import json
import os
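
Aside: nest_asyncio.apply() patches an already-running event loop so asyncio.run() can be called from environments like Jupyter; with it commented out, the example assumes it runs as a plain script. A hedged sketch of the guarded pattern the removed lines supported:

import asyncio

try:
    # Only needed when an event loop is already running (e.g. in a notebook).
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass

async def main() -> None:
    await asyncio.sleep(0)  # placeholder for the example's crawl logic

asyncio.run(main())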

requirements-dev.txt (new file)

@@ -0,0 +1,2 @@
-r requirements.txt
pytest
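
Aside: requirements-dev.txt layers pytest on top of the runtime requirements, so after pip install -r requirements-dev.txt the suite can be run from the command line or programmatically; a sketch of the latter (the "tests" directory name is an assumption):

import sys
import pytest

if __name__ == "__main__":
    # Equivalent to running "pytest -q tests" from the repo root.
    sys.exit(pytest.main(["-q", "tests"]))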

Unpinned requirements file (deleted)

@@ -1,13 +0,0 @@
aiohttp
aiosqlite
bs4
fastapi
html2text
httpx
pydantic
python-dotenv
requests
rich
selenium
uvicorn
chromedriver-autoinstaller

requirements.txt

@@ -1,66 +1,12 @@
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.6.0
async-timeout==4.0.3
attrs==24.2.0
beautifulsoup4==4.12.3
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.7
distro==1.9.0
exceptiongroup==1.2.2
filelock==3.16.1
frozenlist==1.4.1
fsspec==2024.9.0
greenlet==3.0.3
h11==0.14.0
html2text==2024.2.26
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.1
idna==3.10
importlib_metadata==8.5.0
Jinja2==3.1.4
jiter==0.5.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
litellm==1.48.0
lxml==5.3.0
MarkupSafe==2.1.5
multidict==6.1.0
nest-asyncio==1.6.0
litellm==1.48.0
numpy==2.1.1
openai==1.47.1
outcome==1.3.0.post0
packaging==24.1
pillow==10.4.0
playwright==1.47.0
psutil==6.0.0
pydantic==2.9.2
pydantic_core==2.23.4
pyee==12.0.0
PySocks==1.7.1
python-dotenv==1.0.1
PyYAML==6.0.2
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
rpds-py==0.20.0
selenium==4.25.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
tiktoken==0.7.0
tokenizers==0.20.0
tqdm==4.66.5
trio==0.26.2
trio-websocket==0.11.1
typing_extensions==4.12.2
urllib3==2.2.3
websocket-client==1.8.0
wsproto==1.2.0
yarl==1.12.1
zipp==3.20.2
PyYAML==6.0.2
beautifulsoup4==4.12.3
psutil==6.0.0

setup.py

@@ -29,11 +29,11 @@ with open("crawl4ai/__init__.py") as f:
break
# Define the requirements for different environments
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "selenium"))]
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
sync_requirements = ["selenium"]
default_requirements = requirements
torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
sync_requirements = ["selenium"]
def post_install():
print("Running post-installation setup...")
@@ -65,9 +65,9 @@ setup(
extras_require={
"torch": torch_requirements,
"transformer": transformer_requirements,
"sync": sync_requirements,
"cosine": cosine_similarity_requirements,
"all": requirements + sync_requirements + cosine_similarity_requirements,
"sync": sync_requirements,
"all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements,
},
entry_points={
'console_scripts': [
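
Aside: with the extras spelled out explicitly, "all" concatenates the other lists, so a few packages (torch, transformers, nltk, spacy) appear twice; pip tolerates the duplicates, but the de-duplicated optional set looks like this (variable names mirror setup.py, values copied from the hunk above):

torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
sync_requirements = ["selenium"]

# Union of the optional extras without repeats; the base requirements would
# still be prepended for the "all" target.
optional_union = sorted(set(
    torch_requirements
    + transformer_requirements
    + cosine_similarity_requirements
    + sync_requirements
))
print(optional_union)
# ['nltk', 'onnxruntime', 'scikit-learn', 'selenium', 'spacy',
#  'tokenizers', 'torch', 'transformers']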

Async extraction strategy tests

@@ -27,21 +27,21 @@ async def test_regex_chunking():
chunks = json.loads(result.extracted_content)
assert len(chunks) > 1 # Ensure multiple chunks were created
@pytest.mark.asyncio
async def test_cosine_strategy():
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business"
extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
result = await crawler.arun(
url=url,
extraction_strategy=extraction_strategy,
bypass_cache=True
)
assert result.success
assert result.extracted_content
extracted_data = json.loads(result.extracted_content)
assert len(extracted_data) > 0
assert all('tags' in item for item in extracted_data)
# @pytest.mark.asyncio
# async def test_cosine_strategy():
# async with AsyncWebCrawler(verbose=True) as crawler:
# url = "https://www.nbcnews.com/business"
# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
# result = await crawler.arun(
# url=url,
# extraction_strategy=extraction_strategy,
# bypass_cache=True
# )
# assert result.success
# assert result.extracted_content
# extracted_data = json.loads(result.extracted_content)
# assert len(extracted_data) > 0
# assert all('tags' in item for item in extracted_data)
@pytest.mark.asyncio
async def test_llm_extraction_strategy():
@@ -63,24 +63,24 @@ async def test_llm_extraction_strategy():
assert len(extracted_data) > 0
assert all('content' in item for item in extracted_data)
@pytest.mark.asyncio
async def test_combined_chunking_and_extraction():
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business"
chunking_strategy = RegexChunking(patterns=["\n\n"])
extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
result = await crawler.arun(
url=url,
chunking_strategy=chunking_strategy,
extraction_strategy=extraction_strategy,
bypass_cache=True
)
assert result.success
assert result.extracted_content
extracted_data = json.loads(result.extracted_content)
assert len(extracted_data) > 0
assert all('tags' in item for item in extracted_data)
assert all('content' in item for item in extracted_data)
# @pytest.mark.asyncio
# async def test_combined_chunking_and_extraction():
# async with AsyncWebCrawler(verbose=True) as crawler:
# url = "https://www.nbcnews.com/business"
# chunking_strategy = RegexChunking(patterns=["\n\n"])
# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
# result = await crawler.arun(
# url=url,
# chunking_strategy=chunking_strategy,
# extraction_strategy=extraction_strategy,
# bypass_cache=True
# )
# assert result.success
# assert result.extracted_content
# extracted_data = json.loads(result.extracted_content)
# assert len(extracted_data) > 0
# assert all('tags' in item for item in extracted_data)
# assert all('content' in item for item in extracted_data)
# Entry point for debugging
if __name__ == "__main__":