Update .gitignore to ignore temporary and test directories
Parent: 64190dd0c4
Commit: 8b6e88c85c
.gitignore (vendored): 7 changed lines
@@ -191,4 +191,9 @@ ec2*
update_changelog.sh
.DS_Store
docs/.DS_Store
tmp/
test_env/
**/.DS_Store
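A quick way to confirm which rule ignores a given path (the sample paths here are made up for illustration):

    git check-ignore -v tmp/scratch.log test_env/bin/activate   # prints the matching .gitignore pattern for each path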
Dockerfile: 67 changed lines (file deleted)
@@ -1,67 +0,0 @@
# First stage: Build and install dependencies
FROM python:3.10-slim-bookworm

# Set the working directory in the container
WORKDIR /usr/src/app

# Define build arguments
ARG INSTALL_OPTION=default

# Install build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    wget \
    git \
    curl \
    unzip \
    gnupg \
    xvfb \
    ca-certificates \
    apt-transport-https \
    software-properties-common && \
    rm -rf /var/lib/apt/lists/*

# Copy the application code
COPY . .

# Install Crawl4AI using the local setup.py with the specified option
# and download models only for torch, transformer, or all options
RUN if [ "$INSTALL_OPTION" = "all" ]; then \
        pip install --no-cache-dir .[all] && \
        crawl4ai-download-models; \
    elif [ "$INSTALL_OPTION" = "torch" ]; then \
        pip install --no-cache-dir .[torch] && \
        crawl4ai-download-models; \
    elif [ "$INSTALL_OPTION" = "transformer" ]; then \
        pip install --no-cache-dir .[transformer] && \
        crawl4ai-download-models; \
    else \
        pip install --no-cache-dir .; \
    fi

# Install Google Chrome
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
    sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
    apt-get update && \
    apt-get install -y google-chrome-stable

# Set environment to use Chrome properly
ENV CHROME_BIN=/usr/bin/google-chrome \
    DISPLAY=:99 \
    DBUS_SESSION_BUS_ADDRESS=/dev/null \
    PYTHONUNBUFFERED=1

# Ensure the PATH environment variable includes the location of the installed packages
ENV PATH=/opt/conda/bin:$PATH

# Make port 80 available to the world outside this container
EXPOSE 80

# Install mkdocs
RUN pip install mkdocs mkdocs-terminal

# Call mkdocs to build the documentation
RUN mkdocs build

# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
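For reference, the INSTALL_OPTION build argument in the deleted Dockerfile would have been selected at build time roughly like this (the image tag is illustrative):

    docker build --build-arg INSTALL_OPTION=torch -t crawl4ai:torch .   # other values: all, transformer, or omit for the default install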
A second deleted file (name not shown; another Dockerfile variant):

@@ -1,44 +0,0 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy the current directory contents into the container at /usr/src/app
COPY . .

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Install dependencies for Chrome and ChromeDriver
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    xvfb \
    unzip \
    curl \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
    software-properties-common \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/* \
    && apt install chromium-chromedriver -y

# Install spacy library using pip
RUN pip install spacy

# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null

# Make port 80 available to the world outside this container
EXPOSE 80

# Define environment variable
ENV PYTHONUNBUFFERED 1

# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
A modified file (name not shown; the package __init__ carrying the version bump):

@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult

-__version__ = "0.3.1"
+__version__ = "0.3.2"

 __all__ = [
     "AsyncWebCrawler",
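Assuming the hunk above is the crawl4ai package __init__, the bump can be sanity-checked after installation:

    python -c "import crawl4ai; print(crawl4ai.__version__)"   # expected to print 0.3.2 for this release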
Another deleted file (name not shown; a Docker Compose definition):

@@ -1,10 +0,0 @@
version: '3.8'

services:
  web:
    build: .
    command: uvicorn main:app --host 0.0.0.0 --port 80 --workers $(nproc)
    ports:
      - "80:80"
    environment:
      - PYTHONUNBUFFERED=1
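The removed Compose service would have been started with something along these lines (a sketch, not taken from the repo's docs):

    docker compose up --build web   # builds the image from the Dockerfile above and serves uvicorn on port 80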
A modified file (name not shown; a Python script that adjusts sys.path and no longer applies nest_asyncio):

@@ -3,9 +3,9 @@ import os, sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

 import asyncio
-import nest_asyncio
+# import nest_asyncio
+# nest_asyncio.apply()

-nest_asyncio.apply()
 import time
 import json
 import os
requirements-dev.txt: 2 changed lines (new file)
@@ -0,0 +1,2 @@
+-r requirements.txt
+pytest
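The new file makes development setup a two-step affair; a minimal sketch:

    pip install -r requirements-dev.txt   # pulls in requirements.txt plus pytest
    pytest                                # run the test suite from the repo root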
Another deleted file (name not shown; an unpinned requirements list):

@@ -1,13 +0,0 @@
aiohttp
aiosqlite
bs4
fastapi
html2text
httpx
pydantic
python-dotenv
requests
rich
selenium
uvicorn
chromedriver-autoinstaller
A modified file (name not shown; a pinned requirements list, trimmed from 66 lines to 12):

@@ -1,66 +1,12 @@
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.6.0
async-timeout==4.0.3
attrs==24.2.0
beautifulsoup4==4.12.3
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.7
distro==1.9.0
exceptiongroup==1.2.2
filelock==3.16.1
frozenlist==1.4.1
fsspec==2024.9.0
greenlet==3.0.3
h11==0.14.0
html2text==2024.2.26
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.1
idna==3.10
importlib_metadata==8.5.0
Jinja2==3.1.4
jiter==0.5.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
litellm==1.48.0
lxml==5.3.0
MarkupSafe==2.1.5
multidict==6.1.0
nest-asyncio==1.6.0
numpy==2.1.1
openai==1.47.1
outcome==1.3.0.post0
packaging==24.1
pillow==10.4.0
playwright==1.47.0
psutil==6.0.0
pydantic==2.9.2
pydantic_core==2.23.4
pyee==12.0.0
PySocks==1.7.1
python-dotenv==1.0.1
PyYAML==6.0.2
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
rpds-py==0.20.0
selenium==4.25.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
tiktoken==0.7.0
tokenizers==0.20.0
tqdm==4.66.5
trio==0.26.2
trio-websocket==0.11.1
typing_extensions==4.12.2
urllib3==2.2.3
websocket-client==1.8.0
wsproto==1.2.0
yarl==1.12.1
zipp==3.20.2
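If playwright remains among the pinned packages, installing from this file (name assumed to be requirements.txt) usually also needs browser binaries; a hedged sketch:

    pip install -r requirements.txt         # file name assumed; the hunk above does not show it
    python -m playwright install chromium   # download the browser that playwright drives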
setup.py: 12 changed lines
@@ -29,11 +29,11 @@ with open("crawl4ai/__init__.py") as f:
        break

# Define the requirements for different environments
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "selenium"))]
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
sync_requirements = ["selenium"]
default_requirements = requirements
torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
sync_requirements = ["selenium"]

def post_install():
    print("Running post-installation setup...")

@@ -65,9 +65,9 @@ setup(
     extras_require={
         "torch": torch_requirements,
         "transformer": transformer_requirements,
-        "sync": sync_requirements,
         "cosine": cosine_similarity_requirements,
-        "all": requirements + sync_requirements + cosine_similarity_requirements,
+        "sync": sync_requirements,
+        "all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements,
     },
     entry_points={
         'console_scripts': [
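The extras declared above install in the usual setuptools way; the PyPI package name is assumed to be crawl4ai:

    pip install crawl4ai                 # base install (default_requirements)
    pip install "crawl4ai[torch]"        # adds torch, nltk, spacy, scikit-learn
    pip install "crawl4ai[sync]"         # adds selenium for the synchronous crawler
    pip install "crawl4ai[all]"          # every optional dependency group combined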
A modified file (name not shown; the async chunking/extraction strategy tests, with two tests commented out):

@@ -27,21 +27,21 @@ async def test_regex_chunking():
         chunks = json.loads(result.extracted_content)
         assert len(chunks) > 1  # Ensure multiple chunks were created

-@pytest.mark.asyncio
-async def test_cosine_strategy():
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        url = "https://www.nbcnews.com/business"
-        extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
-        result = await crawler.arun(
-            url=url,
-            extraction_strategy=extraction_strategy,
-            bypass_cache=True
-        )
-        assert result.success
-        assert result.extracted_content
-        extracted_data = json.loads(result.extracted_content)
-        assert len(extracted_data) > 0
-        assert all('tags' in item for item in extracted_data)
+# @pytest.mark.asyncio
+# async def test_cosine_strategy():
+#     async with AsyncWebCrawler(verbose=True) as crawler:
+#         url = "https://www.nbcnews.com/business"
+#         extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
+#         result = await crawler.arun(
+#             url=url,
+#             extraction_strategy=extraction_strategy,
+#             bypass_cache=True
+#         )
+#         assert result.success
+#         assert result.extracted_content
+#         extracted_data = json.loads(result.extracted_content)
+#         assert len(extracted_data) > 0
+#         assert all('tags' in item for item in extracted_data)

 @pytest.mark.asyncio
 async def test_llm_extraction_strategy():

@@ -63,24 +63,24 @@ async def test_llm_extraction_strategy():
         assert len(extracted_data) > 0
         assert all('content' in item for item in extracted_data)

-@pytest.mark.asyncio
-async def test_combined_chunking_and_extraction():
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        url = "https://www.nbcnews.com/business"
-        chunking_strategy = RegexChunking(patterns=["\n\n"])
-        extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
-        result = await crawler.arun(
-            url=url,
-            chunking_strategy=chunking_strategy,
-            extraction_strategy=extraction_strategy,
-            bypass_cache=True
-        )
-        assert result.success
-        assert result.extracted_content
-        extracted_data = json.loads(result.extracted_content)
-        assert len(extracted_data) > 0
-        assert all('tags' in item for item in extracted_data)
-        assert all('content' in item for item in extracted_data)
+# @pytest.mark.asyncio
+# async def test_combined_chunking_and_extraction():
+#     async with AsyncWebCrawler(verbose=True) as crawler:
+#         url = "https://www.nbcnews.com/business"
+#         chunking_strategy = RegexChunking(patterns=["\n\n"])
+#         extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
+#         result = await crawler.arun(
+#             url=url,
+#             chunking_strategy=chunking_strategy,
+#             extraction_strategy=extraction_strategy,
+#             bypass_cache=True
+#         )
+#         assert result.success
+#         assert result.extracted_content
+#         extracted_data = json.loads(result.extracted_content)
+#         assert len(extracted_data) > 0
+#         assert all('tags' in item for item in extracted_data)
+#         assert all('content' in item for item in extracted_data)

 # Entry point for debugging
 if __name__ == "__main__":
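The tests left enabled are driven by pytest with an asyncio plugin; a sketch (the plugin choice and test selection are assumptions, not shown in this diff):

    pip install pytest pytest-asyncio                   # @pytest.mark.asyncio needs an asyncio plugin
    pytest -k "regex_chunking or llm_extraction" -v     # runs the tests that remain uncommented above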