crawl4ai/setup.py
UncleCode d0014c6793 New async database manager and migration support
- Introduced AsyncDatabaseManager for async DB management.
  - Added migration feature to transition to file-based storage.
  - Enhanced web crawler with improved caching logic.
  - Updated requirements and setup for async processing.
2024-11-16 14:54:41 +08:00

114 lines
4.2 KiB
Python

from setuptools import setup, find_packages
from setuptools.command.install import install
import os
from pathlib import Path
import shutil
import subprocess
import sys
import asyncio
# Layout of the per-user data directory: ~/.crawl4ai holds a cache folder
# plus one sub-folder for each kind of stored content.
crawl4ai_folder = Path.home() / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = [
    'html_content',
    'cleaned_html',
    'markdown_content',
    'extracted_content',
    'screenshots',
]

# Remove any existing cache folder so it is recreated empty below.
if cache_folder.exists():
    shutil.rmtree(cache_folder)

# Create the base folder first, then the cache, then every content folder.
# exist_ok=True makes re-running the script harmless.
for directory in (
    crawl4ai_folder,
    cache_folder,
    *(crawl4ai_folder / name for name in content_folders),
):
    directory.mkdir(exist_ok=True)
# Read requirements and version.
# Both files are resolved relative to the directory that holds this script,
# so the result does not depend on the current working directory.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

with open(os.path.join(__location__, "requirements.txt"), encoding="utf-8") as f:
    # Blank lines and comment-only lines are not requirement specifiers.
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.strip().startswith("#")
    ]

# Scan _version.py textually for the `__version__ = "..."` assignment.
version = None
with open(os.path.join(__location__, "crawl4ai", "_version.py"), encoding="utf-8") as f:
    for line in f:
        if line.startswith("__version__") and "=" in line:
            # Keep everything after the first "=", then drop surrounding
            # whitespace and either style of quote.
            version = line.split("=", 1)[1].strip().strip("\"'")
            break

# Fail with a clear message here instead of a NameError inside setup().
if version is None:
    raise RuntimeError("Unable to find __version__ in crawl4ai/_version.py")
# Dependency groups.
# The base set is exactly the list read from requirements.txt (same object).
default_requirements = requirements

# Optional groups, exposed through extras_require in the setup() call.
torch_requirements = [
    "torch",
    "nltk",
    "scikit-learn",
]
transformer_requirements = [
    "transformers",
    "tokenizers",
]
cosine_similarity_requirements = [
    "torch",
    "transformers",
    "nltk",
]
sync_requirements = [
    "selenium",
]
def install_playwright():
    """Run ``python -m playwright install`` with the current interpreter.

    This is best-effort: any failure is reported on stdout together with a
    hint to run the command manually, and no exception is propagated for
    ``Exception`` subclasses.
    """
    command = [sys.executable, "-m", "playwright", "install"]
    manual_hint = "Please run 'python -m playwright install' manually after the installation."

    print("Installing Playwright browsers...")
    try:
        subprocess.check_call(command)
        print("Playwright installation completed successfully.")
    except subprocess.CalledProcessError as exc:
        # The command ran but exited with a non-zero status.
        problem = f"Error during Playwright installation: {exc}"
    except Exception as exc:
        # Anything else, e.g. the interpreter could not be launched.
        problem = f"Unexpected error during Playwright installation: {exc}"
    else:
        return

    print(problem)
    print(manual_hint)
def run_migration():
    """Initialize the database as part of installation.

    Imports ``async_db_manager`` from ``crawl4ai.async_database`` and runs its
    ``initialize()`` coroutine to completion with ``asyncio.run``. Failures
    are only reported as warnings on stdout; no ``Exception`` subclass is
    propagated to the caller.
    """
    try:
        print("Starting database initialization...")
        # Imported lazily: the package may not be importable at this point.
        from crawl4ai.async_database import async_db_manager as db_manager

        init_coro = db_manager.initialize()
        asyncio.run(init_coro)
        print("Database initialization completed successfully.")
    except ImportError:
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as exc:
        for message in (
            f"Warning: Database initialization failed: {exc}",
            "Database will be initialized on first use",
        ):
            print(message)
class PostInstallCommand(install):
    """``install`` command that runs extra steps after the standard install."""

    def run(self):
        """Run the regular install, then the post-install steps in order.

        The steps are the Playwright browser installation followed by the
        database initialization.
        """
        super().run()
        for post_step in (install_playwright, run_migration):
            post_step()
# Read the long description through a context manager so the file handle is
# closed deterministically, and locate README.md next to this script instead
# of relying on the current working directory.
with open(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md"),
    encoding="utf-8",
) as readme_file:
    readme_text = readme_file.read()

# Package metadata and build configuration.
setup(
    name="Crawl4AI",
    version=version,
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    long_description=readme_text,
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    # NOTE(review): this says MIT while the classifier below says Apache
    # Software License -- confirm against the project's LICENSE file and make
    # the two agree.
    license="MIT",
    packages=find_packages(),
    install_requires=default_requirements + ["playwright", "aiofiles"],  # Added aiofiles
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
        "cosine": cosine_similarity_requirements,
        "sync": sync_requirements,
        "all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements,
    },
    entry_points={
        'console_scripts': [
            'crawl4ai-download-models=crawl4ai.model_loader:main',
            'crawl4ai-migrate=crawl4ai.migrations:main',  # Added migration command
        ],
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
    # Replace the stock install command so the post-install steps run.
    cmdclass={
        'install': PostInstallCommand,
    },
)