Update .gitignore to ignore temporary and test directories

This commit is contained in:
unclecode 2024-09-26 15:09:49 +08:00
parent 64190dd0c4
commit 8b6e88c85c
11 changed files with 54 additions and 235 deletions

7
.gitignore vendored
View File

@ -191,4 +191,9 @@ ec2*
update_changelog.sh update_changelog.sh
.DS_Store .DS_Store
docs/.DS_Store
tmp/
test_env/
**/.DS_Store
**/.DS_Store

View File

@ -1,67 +0,0 @@
# First stage: Build and install dependencies
FROM python:3.10-slim-bookworm
# Set the working directory in the container
WORKDIR /usr/src/app
# Define build arguments
ARG INSTALL_OPTION=default
# Install build dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
git \
curl \
unzip \
gnupg \
xvfb \
ca-certificates \
apt-transport-https \
software-properties-common && \
rm -rf /var/lib/apt/lists/*
# Copy the application code
COPY . .
# Install Crawl4AI using the local setup.py with the specified option
# and download models only for torch, transformer, or all options
RUN if [ "$INSTALL_OPTION" = "all" ]; then \
pip install --no-cache-dir .[all] && \
crawl4ai-download-models; \
elif [ "$INSTALL_OPTION" = "torch" ]; then \
pip install --no-cache-dir .[torch] && \
crawl4ai-download-models; \
elif [ "$INSTALL_OPTION" = "transformer" ]; then \
pip install --no-cache-dir .[transformer] && \
crawl4ai-download-models; \
else \
pip install --no-cache-dir .; \
fi
# Install Google Chrome
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
apt-get update && \
apt-get install -y google-chrome-stable
# Set environment to use Chrome properly
ENV CHROME_BIN=/usr/bin/google-chrome \
DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1
# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH=/opt/conda/bin:$PATH
# Make port 80 available to the world outside this container
EXPOSE 80
# Install mkdocs
RUN pip install mkdocs mkdocs-terminal
# Call mkdocs to build the documentation
RUN mkdocs build
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

View File

@ -1,44 +0,0 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim
# Set the working directory in the container
WORKDIR /usr/src/app
# Copy the current directory contents into the container at /usr/src/app
COPY . .
# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
# Install dependencies for Chrome and ChromeDriver
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
xvfb \
unzip \
curl \
gnupg2 \
ca-certificates \
apt-transport-https \
software-properties-common \
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
&& apt-get update \
&& apt-get install -y google-chrome-stable \
&& rm -rf /var/lib/apt/lists/* \
&& apt install chromium-chromedriver -y
# Install spacy library using pip
RUN pip install spacy
# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
# Make port 80 available to the world outside this container
EXPOSE 80
# Define environment variable
ENV PYTHONUNBUFFERED 1
# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]

View File

@ -3,7 +3,7 @@
from .async_webcrawler import AsyncWebCrawler from .async_webcrawler import AsyncWebCrawler
from .models import CrawlResult from .models import CrawlResult
__version__ = "0.3.1" __version__ = "0.3.2"
__all__ = [ __all__ = [
"AsyncWebCrawler", "AsyncWebCrawler",

View File

@ -1,10 +0,0 @@
version: '3.8'
services:
web:
build: .
command: uvicorn main:app --host 0.0.0.0 --port 80 --workers $(nproc)
ports:
- "80:80"
environment:
- PYTHONUNBUFFERED=1

View File

@ -3,9 +3,9 @@ import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
import asyncio import asyncio
import nest_asyncio # import nest_asyncio
# nest_asyncio.apply()
nest_asyncio.apply()
import time import time
import json import json
import os import os

2
requirements-dev.txt Normal file
View File

@ -0,0 +1,2 @@
-r requirements.txt
pytest

View File

@ -1,13 +0,0 @@
aiohttp
aiosqlite
bs4
fastapi
html2text
httpx
pydantic
python-dotenv
requests
rich
selenium
uvicorn
chromedriver-autoinstaller

View File

@ -1,66 +1,12 @@
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
aiosqlite==0.20.0 aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.6.0
async-timeout==4.0.3
attrs==24.2.0
beautifulsoup4==4.12.3
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.7
distro==1.9.0
exceptiongroup==1.2.2
filelock==3.16.1
frozenlist==1.4.1
fsspec==2024.9.0
greenlet==3.0.3
h11==0.14.0
html2text==2024.2.26 html2text==2024.2.26
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.1
idna==3.10
importlib_metadata==8.5.0
Jinja2==3.1.4
jiter==0.5.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
litellm==1.48.0
lxml==5.3.0 lxml==5.3.0
MarkupSafe==2.1.5 litellm==1.48.0
multidict==6.1.0
nest-asyncio==1.6.0
numpy==2.1.1 numpy==2.1.1
openai==1.47.1
outcome==1.3.0.post0
packaging==24.1
pillow==10.4.0 pillow==10.4.0
playwright==1.47.0 playwright==1.47.0
psutil==6.0.0
pydantic==2.9.2
pydantic_core==2.23.4
pyee==12.0.0
PySocks==1.7.1
python-dotenv==1.0.1 python-dotenv==1.0.1
PyYAML==6.0.2
referencing==0.35.1
regex==2024.9.11
requests==2.32.3 requests==2.32.3
rpds-py==0.20.0 PyYAML==6.0.2
selenium==4.25.0 beautifulsoup4==4.12.3
sniffio==1.3.1 psutil==6.0.0
sortedcontainers==2.4.0
soupsieve==2.6
tiktoken==0.7.0
tokenizers==0.20.0
tqdm==4.66.5
trio==0.26.2
trio-websocket==0.11.1
typing_extensions==4.12.2
urllib3==2.2.3
websocket-client==1.8.0
wsproto==1.2.0
yarl==1.12.1
zipp==3.20.2

View File

@ -29,11 +29,11 @@ with open("crawl4ai/__init__.py") as f:
break break
# Define the requirements for different environments # Define the requirements for different environments
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "selenium"))] default_requirements = requirements
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))] torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"]
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))] transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
sync_requirements = ["selenium"]
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"] cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
sync_requirements = ["selenium"]
def post_install(): def post_install():
print("Running post-installation setup...") print("Running post-installation setup...")
@ -65,9 +65,9 @@ setup(
extras_require={ extras_require={
"torch": torch_requirements, "torch": torch_requirements,
"transformer": transformer_requirements, "transformer": transformer_requirements,
"sync": sync_requirements,
"cosine": cosine_similarity_requirements, "cosine": cosine_similarity_requirements,
"all": requirements + sync_requirements + cosine_similarity_requirements, "sync": sync_requirements,
"all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements,
}, },
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [

View File

@ -27,21 +27,21 @@ async def test_regex_chunking():
chunks = json.loads(result.extracted_content) chunks = json.loads(result.extracted_content)
assert len(chunks) > 1 # Ensure multiple chunks were created assert len(chunks) > 1 # Ensure multiple chunks were created
@pytest.mark.asyncio # @pytest.mark.asyncio
async def test_cosine_strategy(): # async def test_cosine_strategy():
async with AsyncWebCrawler(verbose=True) as crawler: # async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business" # url = "https://www.nbcnews.com/business"
extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3) # extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
result = await crawler.arun( # result = await crawler.arun(
url=url, # url=url,
extraction_strategy=extraction_strategy, # extraction_strategy=extraction_strategy,
bypass_cache=True # bypass_cache=True
) # )
assert result.success # assert result.success
assert result.extracted_content # assert result.extracted_content
extracted_data = json.loads(result.extracted_content) # extracted_data = json.loads(result.extracted_content)
assert len(extracted_data) > 0 # assert len(extracted_data) > 0
assert all('tags' in item for item in extracted_data) # assert all('tags' in item for item in extracted_data)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_llm_extraction_strategy(): async def test_llm_extraction_strategy():
@ -63,24 +63,24 @@ async def test_llm_extraction_strategy():
assert len(extracted_data) > 0 assert len(extracted_data) > 0
assert all('content' in item for item in extracted_data) assert all('content' in item for item in extracted_data)
@pytest.mark.asyncio # @pytest.mark.asyncio
async def test_combined_chunking_and_extraction(): # async def test_combined_chunking_and_extraction():
async with AsyncWebCrawler(verbose=True) as crawler: # async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business" # url = "https://www.nbcnews.com/business"
chunking_strategy = RegexChunking(patterns=["\n\n"]) # chunking_strategy = RegexChunking(patterns=["\n\n"])
extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3) # extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
result = await crawler.arun( # result = await crawler.arun(
url=url, # url=url,
chunking_strategy=chunking_strategy, # chunking_strategy=chunking_strategy,
extraction_strategy=extraction_strategy, # extraction_strategy=extraction_strategy,
bypass_cache=True # bypass_cache=True
) # )
assert result.success # assert result.success
assert result.extracted_content # assert result.extracted_content
extracted_data = json.loads(result.extracted_content) # extracted_data = json.loads(result.extracted_content)
assert len(extracted_data) > 0 # assert len(extracted_data) > 0
assert all('tags' in item for item in extracted_data) # assert all('tags' in item for item in extracted_data)
assert all('content' in item for item in extracted_data) # assert all('content' in item for item in extracted_data)
# Entry point for debugging # Entry point for debugging
if __name__ == "__main__": if __name__ == "__main__":