Update README, add MANIFEST.in, make selenium an optional dependency
Parent: 4d48bd31ca
Commit: f1eee09cf4

MANIFEST.in (new file, 1 line)

@@ -0,0 +1 @@
include requirements.txt

README.md (61 lines changed)

@@ -16,8 +16,6 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc

✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)

✨ Check out the [Demo](https://crawl4ai.com/mkdocs/demo)

## Features ✨

- 🆓 Completely free and open-source

@@ -57,21 +55,9 @@ For basic web crawling and scraping tasks:

pip install crawl4ai
```

#### Installation with PyTorch

By default this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.

For advanced text clustering (includes CosineSimilarity cluster strategy):

```bash
pip install crawl4ai[torch]
```

#### Installation with Transformers

For text summarization and Hugging Face models:

```bash
pip install crawl4ai[transformer]
```

👉 Note: The standard version of Crawl4AI uses Playwright for asynchronous crawling. If you see an error saying that Playwright is not installed, run `playwright install`; normally this happens automatically during setup.
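
For orientation, a minimal asynchronous usage sketch (the `arun` method and `markdown` field follow the project's documented API rather than anything shown in this diff, so treat them as assumptions and verify against your installed version):

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Playwright drives the headless browser for the async crawler
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print((result.markdown or "")[:300])  # extracted page content as markdown

asyncio.run(main())
```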

#### Installation with Synchronous Version

@@ -81,50 +67,19 @@ If you need the synchronous version using Selenium:

pip install crawl4ai[sync]
```
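
A matching synchronous usage sketch (the `warmup`/`run` methods come from the project's older documentation, not this diff, so treat them as assumptions; note that the `__init__.py` change below warns the sync crawler will be deprecated):

```python
from crawl4ai import WebCrawler  # only usable when crawl4ai[sync] (selenium) is installed

crawler = WebCrawler()
crawler.warmup()                                  # start the Selenium-driven browser
result = crawler.run(url="https://example.com")
print((result.markdown or "")[:300])
```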

#### Installation with Cosine Similarity

#### Development Installation

For using the cosine similarity strategy:

For contributors who plan to modify the source code:

```bash
pip install crawl4ai[cosine]
```

#### Full Installation

For all features:

```bash
pip install crawl4ai[all]
```

After installation, run the following command to install Playwright dependencies:

```bash
playwright install
```

If you've installed the "torch", "transformer", or "all" options, it's recommended to run:

```bash
crawl4ai-download-models
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
pip install -e .
```
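
After the editable install, a quick sanity check (the version string comes from `crawl4ai/__init__.py` later in this commit):

```python
import crawl4ai

print(crawl4ai.__version__)  # expected: "0.3.0" for this commit
```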

### Using Docker 🐳

```bash
# For Mac users (M1/M2)
docker build --platform linux/amd64 -t crawl4ai .

# For other users
docker build -t crawl4ai .
docker run -d -p 8000:80 crawl4ai
```
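
To confirm the container came up, something like the following works; the image's HTTP routes aren't shown in this commit, so any response, even a 404, counts as "up":

```python
import urllib.error
import urllib.request

# docker run above maps container port 80 to host port 8000
try:
    urllib.request.urlopen("http://localhost:8000/", timeout=5)
    print("container is responding")
except urllib.error.HTTPError:
    print("container is responding (non-200 status)")
except urllib.error.URLError as exc:
    print(f"container not reachable: {exc}")
```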

### Using Docker Hub 🐳

```bash
docker pull unclecode/crawl4ai:latest
docker run -d -p 8000:80 unclecode/crawl4ai:latest
```

We're in the process of creating Docker images and pushing them to Docker Hub. This will provide an easy way to run Crawl4AI in a containerized environment. Stay tuned for updates!

For more detailed installation instructions and options, please refer to our [Installation Guide](https://crawl4ai.com/mkdocs/installation).

crawl4ai/__init__.py

@@ -1,11 +1,30 @@
from .web_crawler import WebCrawler
# __init__.py

from .async_webcrawler import AsyncWebCrawler
from .models import CrawlResult

__version__ = "0.3.0"

__all__ = [
    "WebCrawler",
    "AsyncWebCrawler",
    "CrawlResult",
]

def is_sync_version_installed():
    try:
        import selenium
        return True
    except ImportError:
        return False

if is_sync_version_installed():
    try:
        from .web_crawler import WebCrawler
        __all__.append("WebCrawler")
    except ImportError:
        import warnings
        warnings.warn("Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
else:
    WebCrawler = None
    import warnings
    warnings.warn("Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
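
A sketch of how downstream code might handle the now-optional sync crawler (illustrative only, not part of this commit):

```python
import crawl4ai

# __init__.py sets WebCrawler to None when the selenium extra is missing, and may
# not define it at all if the import fails for another reason, so use getattr.
WebCrawler = getattr(crawl4ai, "WebCrawler", None)

if WebCrawler is None:
    crawler = crawl4ai.AsyncWebCrawler()   # fall back to the async crawler
else:
    crawler = WebCrawler()
```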

setup.py (11 lines changed)

@@ -1,4 +1,5 @@
from setuptools import setup, find_packages
from setuptools.command.install import install
import os
from pathlib import Path
import shutil

@@ -16,7 +17,8 @@ crawl4ai_folder.mkdir(exist_ok=True)
cache_folder.mkdir(exist_ok=True)

# Read the requirements from requirements.txt
with open("requirements.txt") as f:
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
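    # Resolving the path against setup.py's own location (rather than the current
    # working directory) keeps this working when the build runs from another
    # directory; MANIFEST.in, added in this commit, ships requirements.txt in the sdist.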
    requirements = f.read().splitlines()

# Read version from __init__.py

@@ -43,6 +45,11 @@ def post_install():
    except FileNotFoundError:
        print("Playwright not found. Please ensure it's installed and run 'playwright install' manually.")

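# Custom install command: run the standard install, then post_install(), which
# attempts to run `playwright install` automatically.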
class PostInstallCommand(install):
    def run(self):
        install.run(self)
        post_install()

setup(
    name="Crawl4AI",
    version=version,

@@ -79,6 +86,6 @@ setup(
    ],
    python_requires=">=3.7",
    cmdclass={
        'install': post_install,
        'install': PostInstallCommand,
    },
)