Mirror of https://github.com/unclecode/crawl4ai.git, synced 2025-11-02 21:12:54 +00:00

refactor: remove legacy build hooks and setup files, migrate to setup.cfg and pyproject.toml

parent 449dd7cc0b
commit 12e73d4898
MANIFEST.in (new file, 1 line)
@@ -0,0 +1 @@
include requirements.txt
build_hooks.py (deleted; the file name is not shown in this view and is inferred from the hook path in the deleted pyproject.toml below)
@@ -1,48 +0,0 @@
import os
import shutil
from pathlib import Path
import subprocess
import sys
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
PLUGIN = "CustomBuildHook"

class CustomBuildHook(BuildHookInterface):
    def initialize(self, version, build_data):
        # Create the .crawl4ai folder structure
        base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
        crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
        crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
        cache_folder = crawl4ai_folder / "cache"
        content_folders = [
            "html_content",
            "cleaned_html",
            "markdown_content",
            "extracted_content",
            "screenshots",
        ]

        # Clean up old cache if exists
        if cache_folder.exists():
            shutil.rmtree(cache_folder)

        # Create new folder structure
        crawl4ai_folder.mkdir(exist_ok=True)
        cache_folder.mkdir(exist_ok=True)
        for folder in content_folders:
            (crawl4ai_folder / folder).mkdir(exist_ok=True)

        # Install Playwright browsers
        try:
            subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        except Exception as e:
            print(f"Warning: Playwright installation failed: {e}")
            print("Please run 'python -m playwright install' manually after installation")

        # Initialize database
        try:
            from crawl4ai.async_database import async_db_manager
            import asyncio
            asyncio.run(async_db_manager.initialize())
        except Exception as e:
            print(f"Warning: Database initialization failed: {e}")
            print("Database will be initialized on first use")
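The hook above also bootstrapped the crawl4ai database, a step the new setup.py later in this diff leaves commented out. As a hedged sketch (not part of this commit), the same bootstrap can be run by hand with the exact call the hook made, assuming the crawl4ai package is importable:

# Sketch, not from this commit: replay the database bootstrap the deleted
# build hook performed, using the same async_db_manager call it made.
import asyncio

from crawl4ai.async_database import async_db_manager

asyncio.run(async_db_manager.initialize())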
crawl4ai/plugin.py (deleted; the file name is not shown in this view and is inferred from the hatch entry point in the deleted pyproject.toml below)
@@ -1,9 +0,0 @@
from colorama import Fore, Style
import subprocess
import sys

def post_install():
    print(f"\n{Fore.YELLOW}{'='*40}")
    print(f"{Fore.RED}IMPORTANT: Run this command now:")
    print(f"{Fore.GREEN}python -m playwright install")
    print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")
crawl4ai/post_install.py (deleted; the file name is not shown in this view and is inferred from the crawl4ai-post-install console script in the deleted pyproject.toml below)
@@ -1,19 +0,0 @@
from colorama import Fore, Style
import subprocess
import sys
import distutils.log as log
from pathlib import Path

def main():
    try:
        subprocess.check_call([sys.executable, "-m", "playwright", "install"],
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.DEVNULL)
    except:
        print(f"\n{Fore.YELLOW}{'='*40}")
        print(f"{Fore.RED}IMPORTANT: Run this command now:")
        print(f"{Fore.GREEN}python -m playwright install")
        print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")

if __name__ == "__main__":
    main()
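A hedged aside: the crawl4ai-post-install console script in the deleted pyproject.toml below pointed at this main(), so the same step could be invoked without the script; the import path is an inference from that script entry, not something this view shows:

# Sketch only: call the post-install step directly. The module path
# crawl4ai.post_install is an assumption inferred from the
# "crawl4ai-post-install = crawl4ai.post_install:main" script entry below.
from crawl4ai.post_install import main

main()  # installs Playwright browsers, or prints the manual command on failure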
pyproject.toml (deleted file)
@@ -1,75 +0,0 @@
[build-system]
requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"]
build-backend = "hatchling.build"

[project]
name = "Crawl4AI"
dynamic = ["version"]
description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.7"
authors = [
    { name = "Unclecode", email = "unclecode@kidocode.com" },
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
]
dependencies = [
    "aiosqlite~=0.20",
    "html2text~=2024.2",
    "lxml~=5.3",
    "litellm>=1.53.1",
    "numpy>=1.26.0,<3",
    "pillow~=10.4",
    "playwright>=1.49.0",
    "python-dotenv~=1.0",
    "requests~=2.26",
    "beautifulsoup4~=4.12",
    "tf-playwright-stealth>=1.1.0",
    "xxhash~=3.4",
    "rank-bm25~=0.2",
    "aiofiles>=24.1.0",
    "colorama~=0.4",
    "snowballstemmer~=2.2",
]

[project.optional-dependencies]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
sync = ["selenium"]
all = [
    "torch",
    "nltk",
    "scikit-learn",
    "transformers",
    "tokenizers",
    "selenium",
]

[project.urls]
Homepage = "https://github.com/unclecode/crawl4ai"
Documentation = "https://crawl4ai.com/mkdocs/"

[project.scripts]
crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-post-install = "crawl4ai.post_install:main"

[tool.hatch.version]
path = "crawl4ai/__version__.py"

[tool.hatch.build.hooks.custom]
dependencies = ["hatch-fancy-pypi-readme>=22.5.0"]
path = "build_hooks.py"

[project.entry-points.hatch]
crawl4ai = "crawl4ai.plugin:post_install"
requirements.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
aiosqlite~=0.20
html2text~=2024.2
lxml~=5.3
litellm>=1.53.1
numpy>=1.26.0,<3
pillow~=10.4
playwright>=1.49.0
python-dotenv~=1.0
requests~=2.26
beautifulsoup4~=4.12
tf-playwright-stealth>=1.1.0
xxhash~=3.4
rank-bm25~=0.2
aiofiles>=24.1.0
colorama~=0.4
snowballstemmer~=2.2
setup.py (new file, 136 lines)
@@ -0,0 +1,136 @@
from setuptools import setup, find_packages
from setuptools.command.install import install
import os
from pathlib import Path
import shutil
import subprocess
import sys
import asyncio

# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = [
    "html_content",
    "cleaned_html",
    "markdown_content",
    "extracted_content",
    "screenshots",
]

# Clean up old cache if exists
if cache_folder.exists():
    shutil.rmtree(cache_folder)

# Create new folder structure
crawl4ai_folder.mkdir(exist_ok=True)
cache_folder.mkdir(exist_ok=True)
for folder in content_folders:
    (crawl4ai_folder / folder).mkdir(exist_ok=True)

# Read requirements and version
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
    requirements = f.read().splitlines()

with open("crawl4ai/__version__.py") as f:
    for line in f:
        if line.startswith("__version__"):
            version = line.split("=")[1].strip().strip('"')
            break

# Define requirements
default_requirements = requirements
torch_requirements = ["torch", "nltk", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]


def install_playwright():
    print("Installing Playwright browsers...")
    try:
        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        print("Playwright installation completed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error during Playwright installation: {e}")
        print(
            "Please run 'python -m playwright install' manually after the installation."
        )
    except Exception as e:
        print(f"Unexpected error during Playwright installation: {e}")
        print(
            "Please run 'python -m playwright install' manually after the installation."
        )


def run_migration():
    """Initialize database during installation"""
    try:
        print("Starting database initialization...")
        from crawl4ai.async_database import async_db_manager

        asyncio.run(async_db_manager.initialize())
        print("Database initialization completed successfully.")
    except ImportError:
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as e:
        print(f"Warning: Database initialization failed: {e}")
        print("Database will be initialized on first use")


class PostInstallCommand(install):
    def run(self):
        install.run(self)
        install_playwright()
        # run_migration()


setup(
    name="Crawl4AI",
    version=version,
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    long_description=open("README.md", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    license="MIT",
    packages=find_packages(),
    install_requires=default_requirements
    + ["playwright", "aiofiles"],  # Added aiofiles
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
        "cosine": cosine_similarity_requirements,
        "sync": sync_requirements,
        "all": default_requirements
        + torch_requirements
        + transformer_requirements
        + cosine_similarity_requirements
        + sync_requirements,
    },
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
            "crawl4ai-migrate=crawl4ai.migrations:main",  # Added migration command
        ],
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
    cmdclass={
        "install": PostInstallCommand,
    },
)
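Since the new setup.py creates the .crawl4ai tree at import time, a quick hedged check (not part of this commit) can confirm the layout it produces, honoring the same CRAWL4_AI_BASE_DIRECTORY override:

# Sketch: verify the folder layout setup.py creates on import. Folder names
# are taken verbatim from the commit; the check itself is illustrative.
import os
from pathlib import Path

base = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
root = (Path(base) if base else Path.home()) / ".crawl4ai"
expected = ["cache", "html_content", "cleaned_html", "markdown_content",
            "extracted_content", "screenshots"]
missing = [name for name in expected if not (root / name).is_dir()]
print("missing folders:", missing or "none")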