From 12e73d489846dc83c29347bf84646ad8daef6cfc Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Fri, 29 Nov 2024 16:01:19 +0800
Subject: [PATCH] refactor: remove legacy build hooks and pyproject.toml,
 migrate to setup.py and setup.cfg

---
 MANIFEST.in      |   1 +
 build_hooks.py   |  48 -----------------
 plugin.py        |   9 ----
 post_install.py  |  19 -------
 pyproject.toml   |  75 --------------------------
 requirements.txt |  16 ++++++
 setup.cfg        |   2 +
 setup.py         | 136 +++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 155 insertions(+), 151 deletions(-)
 create mode 100644 MANIFEST.in
 delete mode 100644 build_hooks.py
 delete mode 100644 plugin.py
 delete mode 100644 post_install.py
 delete mode 100644 pyproject.toml
 create mode 100644 requirements.txt
 create mode 100644 setup.cfg
 create mode 100644 setup.py

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..540b720
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include requirements.txt
\ No newline at end of file
diff --git a/build_hooks.py b/build_hooks.py
deleted file mode 100644
index e59b591..0000000
--- a/build_hooks.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-import shutil
-from pathlib import Path
-import subprocess
-import sys
-from hatchling.builders.hooks.plugin.interface import BuildHookInterface
-PLUGIN = "CustomBuildHook"
-
-class CustomBuildHook(BuildHookInterface):
-    def initialize(self, version, build_data):
-        # Create the .crawl4ai folder structure
-        base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
-        crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
-        crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
-        cache_folder = crawl4ai_folder / "cache"
-        content_folders = [
-            "html_content",
-            "cleaned_html",
-            "markdown_content",
-            "extracted_content",
-            "screenshots",
-        ]
-
-        # Clean up old cache if exists
-        if cache_folder.exists():
-            shutil.rmtree(cache_folder)
-
-        # Create new folder structure
-        crawl4ai_folder.mkdir(exist_ok=True)
-        cache_folder.mkdir(exist_ok=True)
-        for folder in content_folders:
-            (crawl4ai_folder / folder).mkdir(exist_ok=True)
-
-        # Install Playwright browsers
-        try:
-            subprocess.check_call([sys.executable, "-m", "playwright", "install"])
-        except Exception as e:
-            print(f"Warning: Playwright installation failed: {e}")
-            print("Please run 'python -m playwright install' manually after installation")
-
-        # Initialize database
-        try:
-            from crawl4ai.async_database import async_db_manager
-            import asyncio
-            asyncio.run(async_db_manager.initialize())
-        except Exception as e:
-            print(f"Warning: Database initialization failed: {e}")
-            print("Database will be initialized on first use")
\ No newline at end of file
diff --git a/plugin.py b/plugin.py
deleted file mode 100644
index 1e1b11b..0000000
--- a/plugin.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from colorama import Fore, Style
-import subprocess
-import sys
-
-def post_install():
-    print(f"\n{Fore.YELLOW}{'='*40}")
-    print(f"{Fore.RED}IMPORTANT: Run this command now:")
-    print(f"{Fore.GREEN}python -m playwright install")
-    print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")
\ No newline at end of file
diff --git a/post_install.py b/post_install.py
deleted file mode 100644
index e536e54..0000000
--- a/post_install.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from colorama import Fore, Style
-import subprocess
-import sys
-import distutils.log as log
-from pathlib import Path
-
-def main():
-    try:
-        subprocess.check_call([sys.executable, "-m", "playwright", "install"],
-                              stdout=subprocess.DEVNULL,
-                              stderr=subprocess.DEVNULL)
-    except:
-        print(f"\n{Fore.YELLOW}{'='*40}")
-        print(f"{Fore.RED}IMPORTANT: Run this command now:")
-        print(f"{Fore.GREEN}python -m playwright install")
-        print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index cfef810..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,75 +0,0 @@
-[build-system]
-requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"]
-build-backend = "hatchling.build"
-
-[project]
-name = "Crawl4AI"
-dynamic = ["version"]
-description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
-readme = "README.md"
-license = "Apache-2.0"
-requires-python = ">=3.7"
-authors = [
-    { name = "Unclecode", email = "unclecode@kidocode.com" },
-]
-classifiers = [
-    "Development Status :: 3 - Alpha",
-    "Intended Audience :: Developers",
-    "License :: OSI Approved :: Apache Software License",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.7",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
-]
-dependencies = [
-    "aiosqlite~=0.20",
-    "html2text~=2024.2",
-    "lxml~=5.3",
-    "litellm>=1.53.1",
-    "numpy>=1.26.0,<3",
-    "pillow~=10.4",
-    "playwright>=1.49.0",
-    "python-dotenv~=1.0",
-    "requests~=2.26",
-    "beautifulsoup4~=4.12",
-    "tf-playwright-stealth>=1.1.0",
-    "xxhash~=3.4",
-    "rank-bm25~=0.2",
-    "aiofiles>=24.1.0",
-    "colorama~=0.4",
-    "snowballstemmer~=2.2",
-]
-
-[project.optional-dependencies]
-torch = ["torch", "nltk", "scikit-learn"]
-transformer = ["transformers", "tokenizers"]
-cosine = ["torch", "transformers", "nltk"]
-sync = ["selenium"]
-all = [
-    "torch",
-    "nltk",
-    "scikit-learn",
-    "transformers",
-    "tokenizers",
-    "selenium",
-]
-
-[project.urls]
-Homepage = "https://github.com/unclecode/crawl4ai"
-Documentation = "https://crawl4ai.com/mkdocs/"
-
-[project.scripts]
-crawl4ai-download-models = "crawl4ai.model_loader:main"
-crawl4ai-migrate = "crawl4ai.migrations:main"
-crawl4ai-post-install = "crawl4ai.post_install:main"
-
-[tool.hatch.version]
-path = "crawl4ai/__version__.py"
-
-[tool.hatch.build.hooks.custom]
-dependencies = ["hatch-fancy-pypi-readme>=22.5.0"]
-path = "build_hooks.py"
-
-[project.entry-points.hatch]
-crawl4ai = "crawl4ai.plugin:post_install"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c0f6f18
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+aiosqlite~=0.20
+html2text~=2024.2
+lxml~=5.3
+litellm>=1.53.1
+numpy>=1.26.0,<3
+pillow~=10.4
+playwright>=1.49.0
+python-dotenv~=1.0
+requests~=2.26
+beautifulsoup4~=4.12
+tf-playwright-stealth>=1.1.0
+xxhash~=3.4
+rank-bm25~=0.2
+aiofiles>=24.1.0
+colorama~=0.4
+snowballstemmer~=2.2
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..56490d6
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[options]
+include_package_data = True
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..d44169b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,136 @@
+from setuptools import setup, find_packages
+from setuptools.command.install import install
+import os
+from pathlib import Path
+import shutil
+import subprocess
+import sys
+import asyncio
+
+# Create the .crawl4ai folder in the user's home directory if it doesn't exist
+# If the folder already exists, remove the cache folder
+base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
+crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
+crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
+cache_folder = crawl4ai_folder / "cache"
+content_folders = [
+    "html_content",
+    "cleaned_html",
+    "markdown_content",
+    "extracted_content",
+    "screenshots",
+]
+
+# Clean up old cache if exists
+if cache_folder.exists():
+    shutil.rmtree(cache_folder)
+
+# Create new folder structure
+crawl4ai_folder.mkdir(exist_ok=True)
+cache_folder.mkdir(exist_ok=True)
+for folder in content_folders:
+    (crawl4ai_folder / folder).mkdir(exist_ok=True)
+
+# Read requirements and version
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+with open(os.path.join(__location__, "requirements.txt")) as f:
+    requirements = f.read().splitlines()
+
+with open("crawl4ai/__version__.py") as f:
+    for line in f:
+        if line.startswith("__version__"):
+            version = line.split("=")[1].strip().strip('"')
+            break
+
+# Define requirements
+default_requirements = requirements
+torch_requirements = ["torch", "nltk", "scikit-learn"]
+transformer_requirements = ["transformers", "tokenizers"]
+cosine_similarity_requirements = ["torch", "transformers", "nltk"]
+sync_requirements = ["selenium"]
+
+
+def install_playwright():
+    print("Installing Playwright browsers...")
+    try:
+        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
+        print("Playwright installation completed successfully.")
+    except subprocess.CalledProcessError as e:
+        print(f"Error during Playwright installation: {e}")
+        print(
+            "Please run 'python -m playwright install' manually after the installation."
+        )
+    except Exception as e:
+        print(f"Unexpected error during Playwright installation: {e}")
+        print(
+            "Please run 'python -m playwright install' manually after the installation."
+        )
+
+
+def run_migration():
+    """Initialize database during installation"""
+    try:
+        print("Starting database initialization...")
+        from crawl4ai.async_database import async_db_manager
+
+        asyncio.run(async_db_manager.initialize())
+        print("Database initialization completed successfully.")
+    except ImportError:
+        print("Warning: Database module not found. Will initialize on first use.")
+    except Exception as e:
+        print(f"Warning: Database initialization failed: {e}")
+        print("Database will be initialized on first use")
+
+
+class PostInstallCommand(install):
+    def run(self):
+        install.run(self)
+        install_playwright()
+        # run_migration()
+
+
+setup(
+    name="Crawl4AI",
+    version=version,
+    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
+    long_description=open("README.md", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/unclecode/crawl4ai",
+    author="Unclecode",
+    author_email="unclecode@kidocode.com",
+    license="Apache-2.0",
+    packages=find_packages(),
+    install_requires=default_requirements
+    + ["playwright", "aiofiles"],  # Added aiofiles
+    extras_require={
+        "torch": torch_requirements,
+        "transformer": transformer_requirements,
+        "cosine": cosine_similarity_requirements,
+        "sync": sync_requirements,
+        "all": default_requirements
+        + torch_requirements
+        + transformer_requirements
+        + cosine_similarity_requirements
+        + sync_requirements,
+    },
+    entry_points={
+        "console_scripts": [
+            "crawl4ai-download-models=crawl4ai.model_loader:main",
+            "crawl4ai-migrate=crawl4ai.migrations:main",  # Added migration command
+        ],
+    },
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+    ],
+    python_requires=">=3.7",
+    cmdclass={
+        "install": PostInstallCommand,
+    },
+)
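
Note on the install flow this patch introduces: setuptools only routes through a cmdclass override like PostInstallCommand on a direct source install; pip normally builds a wheel first and installs that, which bypasses the custom install command entirely. Below is a minimal sketch of exercising both paths, assuming the patched repository is the working directory; nothing in it is part of the patch itself.

    # Sketch: exercising the PostInstallCommand path from the new setup.py.
    # Assumes the patched repository is the current working directory.
    import subprocess
    import sys

    # Legacy-style source install: this goes through PostInstallCommand.run(),
    # which chains install.run(self) and then install_playwright().
    subprocess.check_call([sys.executable, "setup.py", "install"])

    # A pip install builds a wheel first and skips cmdclass hooks, so the
    # Playwright browsers may need to be fetched explicitly afterwards:
    subprocess.check_call([sys.executable, "-m", "playwright", "install"])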
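
The extras_require map in setup.py mirrors the optional-dependency groups of the deleted pyproject.toml (torch, transformer, cosine, sync, all). A hedged example of selecting one group, using only the package and extra names that appear in the patch:

    # Sketch: installing an optional dependency group defined in extras_require.
    # "Crawl4AI" and the extra name "torch" both come from this patch.
    import subprocess
    import sys

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "Crawl4AI[torch]"]
    )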