Update README, add manifest, make selenium an optional library

unclecode 2024-09-25 16:35:14 +08:00
parent 4d48bd31ca
commit f1eee09cf4
5 changed files with 41 additions and 57 deletions

MANIFEST.in Normal file

@@ -0,0 +1 @@
include requirements.txt

README.md

@@ -16,8 +16,6 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
✨ Check out the [Demo](https://crawl4ai.com/mkdocs/demo)
## Features ✨
- 🆓 Completely free and open-source
@@ -57,21 +55,9 @@ For basic web crawling and scraping tasks:
pip install crawl4ai
```
#### Installation with PyTorch
By default, this installs the asynchronous version of Crawl4AI, which uses Playwright for web crawling.
For advanced text clustering (includes CosineSimilarity cluster strategy):
```bash
pip install crawl4ai[torch]
```
#### Installation with Transformers
For text summarization and Hugging Face models:
```bash
pip install crawl4ai[transformer]
```
👉 Note: The standard version of Crawl4AI uses Playwright for asynchronous crawling. If you see an error saying Playwright is not installed, run `playwright install`; however, this should happen automatically during setup.
#### Installation with Synchronous Version
@@ -81,50 +67,19 @@ If you need the synchronous version using Selenium:
pip install crawl4ai[sync]
```
#### Installation with Cosine Similarity
#### Development Installation
For using the cosine similarity strategy:
For contributors who plan to modify the source code:
```bash
pip install crawl4ai[cosine]
```
#### Full Installation
For all features:
```bash
pip install crawl4ai[all]
```
After installation, run the following command to install Playwright dependencies:
```bash
playwright install
```
If you've installed the "torch", "transformer", or "all" options, it's recommended to run:
```bash
crawl4ai-download-models
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
pip install -e .
```
### Using Docker 🐳
```bash
# For Mac users (M1/M2)
docker build --platform linux/amd64 -t crawl4ai .
# For other users
docker build -t crawl4ai .
docker run -d -p 8000:80 crawl4ai
```
### Using Docker Hub 🐳
```bash
docker pull unclecode/crawl4ai:latest
docker run -d -p 8000:80 unclecode/crawl4ai:latest
```
We're in the process of creating Docker images and pushing them to Docker Hub. This will provide an easy way to run Crawl4AI in a containerized environment. Stay tuned for updates!
For more detailed installation instructions and options, please refer to our [Installation Guide](https://crawl4ai.com/mkdocs/installation).
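As the README change above notes, the default install now ships only the asynchronous, Playwright-backed crawler. A minimal usage sketch against the `AsyncWebCrawler` API exported in the updated `__init__.py` below (the target URL is just a placeholder) might look like this:
```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def main():
    # The default crawler drives a Playwright browser under the hood.
    async with AsyncWebCrawler() as crawler:
        # arun() crawls a single page and returns a CrawlResult; the URL is a placeholder.
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)


if __name__ == "__main__":
    asyncio.run(main())
```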

crawl4ai/__init__.py

@@ -1,11 +1,30 @@
from .web_crawler import WebCrawler
# __init__.py
from .async_webcrawler import AsyncWebCrawler
from .models import CrawlResult
__version__ = "0.3.0"
__all__ = [
    "WebCrawler",
    "AsyncWebCrawler",
    "CrawlResult",
]

def is_sync_version_installed():
    try:
        import selenium
        return True
    except ImportError:
        return False

if is_sync_version_installed():
    try:
        from .web_crawler import WebCrawler
        __all__.append("WebCrawler")
    except ImportError:
        import warnings
        print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
else:
    WebCrawler = None
    import warnings
    print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")

setup.cfg Normal file

@@ -0,0 +1,2 @@
[options]
include_package_data = True

setup.py

@@ -1,4 +1,5 @@
from setuptools import setup, find_packages
from setuptools.command.install import install
import os
from pathlib import Path
import shutil
@@ -16,7 +17,8 @@ crawl4ai_folder.mkdir(exist_ok=True)
cache_folder.mkdir(exist_ok=True)

# Read the requirements from requirements.txt
with open("requirements.txt") as f:
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
    requirements = f.read().splitlines()
# Read version from __init__.py
@@ -43,6 +45,11 @@ def post_install():
    except FileNotFoundError:
        print("Playwright not found. Please ensure it's installed and run 'playwright install' manually.")

class PostInstallCommand(install):
    def run(self):
        install.run(self)
        post_install()

setup(
    name="Crawl4AI",
    version=version,
@@ -79,6 +86,6 @@ setup(
    ],
    python_requires=">=3.7",
    cmdclass={
        'install': post_install,
        'install': PostInstallCommand,
    },
)
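The last hunk swaps the bare `post_install` function for `PostInstallCommand`, since `cmdclass` entries must be setuptools `Command` subclasses rather than plain callables. A minimal standalone sketch of that pattern, with placeholder metadata and a hypothetical post-install step:
```python
import subprocess

from setuptools import setup
from setuptools.command.install import install


class PostInstallCommand(install):
    """Install command that runs an extra step after the standard install."""

    def run(self):
        install.run(self)  # run the normal installation first
        # Hypothetical post-install hook; report failures instead of aborting.
        try:
            subprocess.run(["playwright", "install"], check=True)
        except (FileNotFoundError, subprocess.CalledProcessError) as exc:
            print(f"Post-install step failed: {exc}. Run 'playwright install' manually.")


setup(
    name="example-package",  # placeholder metadata, not Crawl4AI's real setup() call
    version="0.0.1",
    cmdclass={"install": PostInstallCommand},
)
```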