Update .gitignore to ignore temporary and test directories
This commit is contained in:
parent
64190dd0c4
commit
8b6e88c85c
7
.gitignore
vendored
7
.gitignore
vendored
@ -191,4 +191,9 @@ ec2*
|
|||||||
|
|
||||||
update_changelog.sh
|
update_changelog.sh
|
||||||
|
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
docs/.DS_Store
|
||||||
|
tmp/
|
||||||
|
test_env/
|
||||||
|
**/.DS_Store
|
||||||
|
**/.DS_Store
|
67
Dockerfile
67
Dockerfile
@ -1,67 +0,0 @@
|
|||||||
# First stage: Build and install dependencies
|
|
||||||
FROM python:3.10-slim-bookworm
|
|
||||||
|
|
||||||
# Set the working directory in the container
|
|
||||||
WORKDIR /usr/src/app
|
|
||||||
|
|
||||||
# Define build arguments
|
|
||||||
ARG INSTALL_OPTION=default
|
|
||||||
|
|
||||||
# Install build dependencies
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends \
|
|
||||||
wget \
|
|
||||||
git \
|
|
||||||
curl \
|
|
||||||
unzip \
|
|
||||||
gnupg \
|
|
||||||
xvfb \
|
|
||||||
ca-certificates \
|
|
||||||
apt-transport-https \
|
|
||||||
software-properties-common && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Copy the application code
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Install Crawl4AI using the local setup.py with the specified option
|
|
||||||
# and download models only for torch, transformer, or all options
|
|
||||||
RUN if [ "$INSTALL_OPTION" = "all" ]; then \
|
|
||||||
pip install --no-cache-dir .[all] && \
|
|
||||||
crawl4ai-download-models; \
|
|
||||||
elif [ "$INSTALL_OPTION" = "torch" ]; then \
|
|
||||||
pip install --no-cache-dir .[torch] && \
|
|
||||||
crawl4ai-download-models; \
|
|
||||||
elif [ "$INSTALL_OPTION" = "transformer" ]; then \
|
|
||||||
pip install --no-cache-dir .[transformer] && \
|
|
||||||
crawl4ai-download-models; \
|
|
||||||
else \
|
|
||||||
pip install --no-cache-dir .; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install Google Chrome
|
|
||||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
|
||||||
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
|
|
||||||
apt-get update && \
|
|
||||||
apt-get install -y google-chrome-stable
|
|
||||||
|
|
||||||
# Set environment to use Chrome properly
|
|
||||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
|
||||||
DISPLAY=:99 \
|
|
||||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
|
||||||
PYTHONUNBUFFERED=1
|
|
||||||
|
|
||||||
# Ensure the PATH environment variable includes the location of the installed packages
|
|
||||||
ENV PATH=/opt/conda/bin:$PATH
|
|
||||||
|
|
||||||
# Make port 80 available to the world outside this container
|
|
||||||
EXPOSE 80
|
|
||||||
|
|
||||||
# Install mkdocs
|
|
||||||
RUN pip install mkdocs mkdocs-terminal
|
|
||||||
|
|
||||||
# Call mkdocs to build the documentation
|
|
||||||
RUN mkdocs build
|
|
||||||
|
|
||||||
# Run uvicorn
|
|
||||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
|
@ -1,44 +0,0 @@
|
|||||||
# Use an official Python runtime as a parent image
|
|
||||||
FROM python:3.10-slim
|
|
||||||
|
|
||||||
# Set the working directory in the container
|
|
||||||
WORKDIR /usr/src/app
|
|
||||||
|
|
||||||
# Copy the current directory contents into the container at /usr/src/app
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Install any needed packages specified in requirements.txt
|
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
|
||||||
|
|
||||||
# Install dependencies for Chrome and ChromeDriver
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
||||||
wget \
|
|
||||||
xvfb \
|
|
||||||
unzip \
|
|
||||||
curl \
|
|
||||||
gnupg2 \
|
|
||||||
ca-certificates \
|
|
||||||
apt-transport-https \
|
|
||||||
software-properties-common \
|
|
||||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
|
||||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
|
||||||
&& apt-get update \
|
|
||||||
&& apt-get install -y google-chrome-stable \
|
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
|
||||||
&& apt install chromium-chromedriver -y
|
|
||||||
|
|
||||||
# Install spacy library using pip
|
|
||||||
RUN pip install spacy
|
|
||||||
|
|
||||||
# Set display port and dbus env to avoid hanging
|
|
||||||
ENV DISPLAY=:99
|
|
||||||
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
|
||||||
|
|
||||||
# Make port 80 available to the world outside this container
|
|
||||||
EXPOSE 80
|
|
||||||
|
|
||||||
# Define environment variable
|
|
||||||
ENV PYTHONUNBUFFERED 1
|
|
||||||
|
|
||||||
# Run uvicorn
|
|
||||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
|
@ -3,7 +3,7 @@
|
|||||||
from .async_webcrawler import AsyncWebCrawler
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
|
|
||||||
__version__ = "0.3.1"
|
__version__ = "0.3.2"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
|
@ -1,10 +0,0 @@
|
|||||||
version: '3.8'
|
|
||||||
|
|
||||||
services:
|
|
||||||
web:
|
|
||||||
build: .
|
|
||||||
command: uvicorn main:app --host 0.0.0.0 --port 80 --workers $(nproc)
|
|
||||||
ports:
|
|
||||||
- "80:80"
|
|
||||||
environment:
|
|
||||||
- PYTHONUNBUFFERED=1
|
|
@ -3,9 +3,9 @@ import os, sys
|
|||||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import nest_asyncio
|
# import nest_asyncio
|
||||||
|
# nest_asyncio.apply()
|
||||||
|
|
||||||
nest_asyncio.apply()
|
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
2
requirements-dev.txt
Normal file
2
requirements-dev.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
-r requirements.txt
|
||||||
|
pytest
|
@ -1,13 +0,0 @@
|
|||||||
aiohttp
|
|
||||||
aiosqlite
|
|
||||||
bs4
|
|
||||||
fastapi
|
|
||||||
html2text
|
|
||||||
httpx
|
|
||||||
pydantic
|
|
||||||
python-dotenv
|
|
||||||
requests
|
|
||||||
rich
|
|
||||||
selenium
|
|
||||||
uvicorn
|
|
||||||
chromedriver-autoinstaller
|
|
@ -1,66 +1,12 @@
|
|||||||
aiohappyeyeballs==2.4.0
|
|
||||||
aiohttp==3.10.5
|
|
||||||
aiosignal==1.3.1
|
|
||||||
aiosqlite==0.20.0
|
aiosqlite==0.20.0
|
||||||
annotated-types==0.7.0
|
|
||||||
anyio==4.6.0
|
|
||||||
async-timeout==4.0.3
|
|
||||||
attrs==24.2.0
|
|
||||||
beautifulsoup4==4.12.3
|
|
||||||
certifi==2024.8.30
|
|
||||||
charset-normalizer==3.3.2
|
|
||||||
click==8.1.7
|
|
||||||
distro==1.9.0
|
|
||||||
exceptiongroup==1.2.2
|
|
||||||
filelock==3.16.1
|
|
||||||
frozenlist==1.4.1
|
|
||||||
fsspec==2024.9.0
|
|
||||||
greenlet==3.0.3
|
|
||||||
h11==0.14.0
|
|
||||||
html2text==2024.2.26
|
html2text==2024.2.26
|
||||||
httpcore==1.0.5
|
|
||||||
httpx==0.27.2
|
|
||||||
huggingface-hub==0.25.1
|
|
||||||
idna==3.10
|
|
||||||
importlib_metadata==8.5.0
|
|
||||||
Jinja2==3.1.4
|
|
||||||
jiter==0.5.0
|
|
||||||
jsonschema==4.23.0
|
|
||||||
jsonschema-specifications==2023.12.1
|
|
||||||
litellm==1.48.0
|
|
||||||
lxml==5.3.0
|
lxml==5.3.0
|
||||||
MarkupSafe==2.1.5
|
litellm==1.48.0
|
||||||
multidict==6.1.0
|
|
||||||
nest-asyncio==1.6.0
|
|
||||||
numpy==2.1.1
|
numpy==2.1.1
|
||||||
openai==1.47.1
|
|
||||||
outcome==1.3.0.post0
|
|
||||||
packaging==24.1
|
|
||||||
pillow==10.4.0
|
pillow==10.4.0
|
||||||
playwright==1.47.0
|
playwright==1.47.0
|
||||||
psutil==6.0.0
|
|
||||||
pydantic==2.9.2
|
|
||||||
pydantic_core==2.23.4
|
|
||||||
pyee==12.0.0
|
|
||||||
PySocks==1.7.1
|
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
PyYAML==6.0.2
|
|
||||||
referencing==0.35.1
|
|
||||||
regex==2024.9.11
|
|
||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
rpds-py==0.20.0
|
PyYAML==6.0.2
|
||||||
selenium==4.25.0
|
beautifulsoup4==4.12.3
|
||||||
sniffio==1.3.1
|
psutil==6.0.0
|
||||||
sortedcontainers==2.4.0
|
|
||||||
soupsieve==2.6
|
|
||||||
tiktoken==0.7.0
|
|
||||||
tokenizers==0.20.0
|
|
||||||
tqdm==4.66.5
|
|
||||||
trio==0.26.2
|
|
||||||
trio-websocket==0.11.1
|
|
||||||
typing_extensions==4.12.2
|
|
||||||
urllib3==2.2.3
|
|
||||||
websocket-client==1.8.0
|
|
||||||
wsproto==1.2.0
|
|
||||||
yarl==1.12.1
|
|
||||||
zipp==3.20.2
|
|
12
setup.py
12
setup.py
@ -29,11 +29,11 @@ with open("crawl4ai/__init__.py") as f:
|
|||||||
break
|
break
|
||||||
|
|
||||||
# Define the requirements for different environments
|
# Define the requirements for different environments
|
||||||
default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "selenium"))]
|
default_requirements = requirements
|
||||||
torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
|
torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"]
|
||||||
transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
|
transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
|
||||||
sync_requirements = ["selenium"]
|
|
||||||
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
|
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
|
||||||
|
sync_requirements = ["selenium"]
|
||||||
|
|
||||||
def post_install():
|
def post_install():
|
||||||
print("Running post-installation setup...")
|
print("Running post-installation setup...")
|
||||||
@ -65,9 +65,9 @@ setup(
|
|||||||
extras_require={
|
extras_require={
|
||||||
"torch": torch_requirements,
|
"torch": torch_requirements,
|
||||||
"transformer": transformer_requirements,
|
"transformer": transformer_requirements,
|
||||||
"sync": sync_requirements,
|
|
||||||
"cosine": cosine_similarity_requirements,
|
"cosine": cosine_similarity_requirements,
|
||||||
"all": requirements + sync_requirements + cosine_similarity_requirements,
|
"sync": sync_requirements,
|
||||||
|
"all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements,
|
||||||
},
|
},
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
|
@ -27,21 +27,21 @@ async def test_regex_chunking():
|
|||||||
chunks = json.loads(result.extracted_content)
|
chunks = json.loads(result.extracted_content)
|
||||||
assert len(chunks) > 1 # Ensure multiple chunks were created
|
assert len(chunks) > 1 # Ensure multiple chunks were created
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
# @pytest.mark.asyncio
|
||||||
async def test_cosine_strategy():
|
# async def test_cosine_strategy():
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
# async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
url = "https://www.nbcnews.com/business"
|
# url = "https://www.nbcnews.com/business"
|
||||||
extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
|
# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
|
||||||
result = await crawler.arun(
|
# result = await crawler.arun(
|
||||||
url=url,
|
# url=url,
|
||||||
extraction_strategy=extraction_strategy,
|
# extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
# bypass_cache=True
|
||||||
)
|
# )
|
||||||
assert result.success
|
# assert result.success
|
||||||
assert result.extracted_content
|
# assert result.extracted_content
|
||||||
extracted_data = json.loads(result.extracted_content)
|
# extracted_data = json.loads(result.extracted_content)
|
||||||
assert len(extracted_data) > 0
|
# assert len(extracted_data) > 0
|
||||||
assert all('tags' in item for item in extracted_data)
|
# assert all('tags' in item for item in extracted_data)
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_llm_extraction_strategy():
|
async def test_llm_extraction_strategy():
|
||||||
@ -63,24 +63,24 @@ async def test_llm_extraction_strategy():
|
|||||||
assert len(extracted_data) > 0
|
assert len(extracted_data) > 0
|
||||||
assert all('content' in item for item in extracted_data)
|
assert all('content' in item for item in extracted_data)
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
# @pytest.mark.asyncio
|
||||||
async def test_combined_chunking_and_extraction():
|
# async def test_combined_chunking_and_extraction():
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
# async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
url = "https://www.nbcnews.com/business"
|
# url = "https://www.nbcnews.com/business"
|
||||||
chunking_strategy = RegexChunking(patterns=["\n\n"])
|
# chunking_strategy = RegexChunking(patterns=["\n\n"])
|
||||||
extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
|
# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
|
||||||
result = await crawler.arun(
|
# result = await crawler.arun(
|
||||||
url=url,
|
# url=url,
|
||||||
chunking_strategy=chunking_strategy,
|
# chunking_strategy=chunking_strategy,
|
||||||
extraction_strategy=extraction_strategy,
|
# extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
# bypass_cache=True
|
||||||
)
|
# )
|
||||||
assert result.success
|
# assert result.success
|
||||||
assert result.extracted_content
|
# assert result.extracted_content
|
||||||
extracted_data = json.loads(result.extracted_content)
|
# extracted_data = json.loads(result.extracted_content)
|
||||||
assert len(extracted_data) > 0
|
# assert len(extracted_data) > 0
|
||||||
assert all('tags' in item for item in extracted_data)
|
# assert all('tags' in item for item in extracted_data)
|
||||||
assert all('content' in item for item in extracted_data)
|
# assert all('content' in item for item in extracted_data)
|
||||||
|
|
||||||
# Entry point for debugging
|
# Entry point for debugging
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
x
Reference in New Issue
Block a user