2024-05-09 19:10:25 +08:00
|
|
|
from setuptools import setup, find_packages
|
2024-09-25 16:35:14 +08:00
|
|
|
from setuptools.command.install import install
|
2024-05-17 22:11:00 +08:00
|
|
|
import os
|
2024-06-19 18:32:20 +08:00
|
|
|
from pathlib import Path
|
2024-07-09 13:25:00 +08:00
|
|
|
import shutil
|
2024-09-24 20:52:08 +08:00
|
|
|
import subprocess
|
2024-09-29 17:07:06 +08:00
|
|
|
import sys
|
2024-11-16 14:54:41 +08:00
|
|
|
import asyncio
|
2024-05-09 19:10:25 +08:00
|
|
|
|
2024-11-16 14:54:41 +08:00
|
|
|
# Create the .crawl4ai folder structure
# Everything lives under ~/.crawl4ai: a cache folder plus one folder per content type.
crawl4ai_folder = Path.home() / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = [
    "html_content",
    "cleaned_html",
    "markdown_content",
    "extracted_content",
    "screenshots",
]

# Clean up old cache if exists
if cache_folder.exists():
    shutil.rmtree(cache_folder)

# Create new folder structure
# Order matters: the base folder has to exist before anything nested inside it.
_folders_to_create = [crawl4ai_folder, cache_folder]
_folders_to_create += [crawl4ai_folder / name for name in content_folders]
for _target in _folders_to_create:
    _target.mkdir(exist_ok=True)
|
2024-07-08 16:33:25 +08:00
|
|
|
|
2024-11-16 14:54:41 +08:00
|
|
|
# Read requirements and version
# Directory containing this file; both reads below are resolved against it so
# they do not depend on the directory the build was started from.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

with open(os.path.join(__location__, "requirements.txt"), encoding="utf-8") as f:
    requirements = f.read().splitlines()

# Parse the version out of the file as text; the package itself is not imported here.
version = None
with open(os.path.join(__location__, "crawl4ai", "_version.py"), encoding="utf-8") as f:
    for line in f:
        if line.startswith("__version__"):
            # Accept either quote style around the version string.
            version = line.split("=", 1)[1].strip().strip("\"'")
            break

if version is None:
    # Fail here with a clear message rather than with a NameError inside setup().
    raise RuntimeError("Unable to find __version__ in crawl4ai/_version.py")
|
2024-05-17 15:52:39 +08:00
|
|
|
|
2024-11-16 14:54:41 +08:00
|
|
|
# Define requirements
# Base dependency list: the lines read from requirements.txt.
default_requirements = requirements

# Optional dependency groups, exposed through extras_require in setup().
sync_requirements = ["selenium"]
transformer_requirements = [
    "transformers",
    "tokenizers",
]
torch_requirements = [
    "torch",
    "nltk",
    "scikit-learn",
]
cosine_similarity_requirements = [
    "torch",
    "transformers",
    "nltk",
]
|
2024-09-24 20:52:08 +08:00
|
|
|
|
2024-09-29 17:07:06 +08:00
|
|
|
def install_playwright():
    """Install the Playwright browsers with the current Python interpreter.

    Runs ``<sys.executable> -m playwright install`` as a subprocess. This is
    best-effort: any ``Exception`` is reported on stdout together with a hint
    to run the command manually, and is not re-raised.
    """
    manual_hint = "Please run 'python -m playwright install' manually after the installation."

    print("Installing Playwright browsers...")
    try:
        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        print("Playwright installation completed successfully.")
    except Exception as exc:
        # A non-zero exit status gets its own wording; anything else is "unexpected".
        if isinstance(exc, subprocess.CalledProcessError):
            print(f"Error during Playwright installation: {exc}")
        else:
            print(f"Unexpected error during Playwright installation: {exc}")
        print(manual_hint)
|
2024-05-17 15:52:39 +08:00
|
|
|
|
2024-11-16 14:54:41 +08:00
|
|
|
def run_migration():
    """Initialize the crawl4ai database as part of installation.

    Imports ``async_db_manager`` from ``crawl4ai.async_database`` and drives
    its ``initialize()`` coroutine to completion with ``asyncio.run``. Any
    ``Exception`` is reported as a warning on stdout and is not re-raised.
    """
    try:
        print("Starting database initialization...")
        # Imported inside the try so that a failed import is reported as a
        # warning below instead of aborting the caller.
        from crawl4ai.async_database import async_db_manager

        init_coro = async_db_manager.initialize()
        asyncio.run(init_coro)
        print("Database initialization completed successfully.")
    except ImportError:
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as err:
        print(f"Warning: Database initialization failed: {err}")
        print("Database will be initialized on first use")
|
|
|
|
|
2024-09-25 16:35:14 +08:00
|
|
|
class PostInstallCommand(install):
    """``install`` command that also runs the project's post-install steps."""

    def run(self):
        """Run the regular install, then each post-install step in order."""
        super().run()
        # Playwright browsers first, database initialization second.
        for post_install_step in (install_playwright, run_migration):
            post_install_step()
|
2024-09-29 17:07:06 +08:00
|
|
|
|
2024-05-09 19:10:25 +08:00
|
|
|
# Package metadata and build configuration.
# NOTE(review): `license` below says "MIT" while the classifiers declare the
# Apache Software License -- confirm which one is intended and align them.
setup(
    name="Crawl4AI",
    version=version,
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    # read_text() opens and closes the file itself, so no handle is left open.
    # The path is resolved against this file's directory, like requirements.txt.
    long_description=Path(__location__, "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    license="MIT",
    packages=find_packages(),
    # Base requirements plus the packages that are always needed.
    install_requires=default_requirements + ["playwright", "aiofiles"],
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
        "cosine": cosine_similarity_requirements,
        "sync": sync_requirements,
        "all": default_requirements
        + torch_requirements
        + transformer_requirements
        + cosine_similarity_requirements
        + sync_requirements,
    },
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
            "crawl4ai-migrate=crawl4ai.migrations:main",  # migration command
        ],
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
    # Replace the stock install command so the post-install steps run after it.
    cmdclass={
        "install": PostInstallCommand,
    },
)
|