# crawl4ai/pyproject.toml

[build-system]
# `[project].license` is written as an SPDX expression string (PEP 639).
# setuptools only understands that form starting with the 77.x series; older
# releases fail pyproject validation on it, so the floor here must not allow
# them. 77.0.3 is the minimum the Python Packaging User Guide recommends.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "Crawl4AI"
# The version is not hard-coded here; the build backend supplies it
# (see `[tool.setuptools.dynamic]`).
dynamic = ["version"]
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
requires-python = ">=3.9"
# SPDX license expression (PEP 639 string form).
license = "Apache-2.0"
authors = [
    {name = "Unclecode", email = "unclecode@kidocode.com"}
]
# Runtime requirements (PEP 508 strings). Each distribution is listed exactly
# once, together with its version constraint.
dependencies = [
    "aiosqlite~=0.20",
    "lxml~=5.3",
    "litellm>=1.53.1",
    "numpy>=1.26.0,<3",
    "pillow~=10.4",
    "playwright>=1.49.0",
    "python-dotenv~=1.0",
    "requests~=2.26",
    "beautifulsoup4~=4.12",
    "tf-playwright-stealth>=1.1.0",
    "xxhash~=3.4",
    "rank-bm25~=0.2",
    "aiofiles>=24.1.0",
    "colorama~=0.4",
    "snowballstemmer~=2.2",
    "pydantic>=2.10",
    "pyOpenSSL>=24.3.0",
    "psutil>=6.1.1",
    "nltk>=3.9.1",
    "rich>=13.9.4",
    "cssselect>=1.2.0",
    "httpx>=0.27.2",
    "fake-useragent>=2.0.3",
    "click>=8.1.7",
    "pyperclip>=1.8.2",
    "chardet>=5.2.0",
    "aiohttp>=3.11.11",
    "brotli>=1.1.0",
    "humanize>=4.10.0",
]
# Keep the per-version classifiers in step with `requires-python`.
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]

[project.optional-dependencies]
# Extras are opt-in feature sets, installed with e.g. `pip install crawl4ai[pdf]`.
pdf = ["PyPDF2"]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
sync = ["selenium"]
# Union of every extra above, each distribution listed once. Update this list
# whenever an extra is added or changed.
all = [
    "PyPDF2",
    "torch",
    "nltk",
    "scikit-learn",
    "transformers",
    "tokenizers",
    "selenium",
]

[project.scripts]
# Console commands created at install time. Each value is a
# `module.path:callable` entry point; keys are kept in alphabetical order.
crawl4ai-doctor = "crawl4ai.install:doctor"
crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install"
# Short alias for the main command-line interface.
crwl = "crawl4ai.cli:main"

# Automatic package discovery: search from the project root and pick up the
# `crawl4ai` package plus everything whose name starts with `crawl4ai`.
[tool.setuptools.packages.find]
where = ["."]
include = ["crawl4ai*"]

# Non-Python files shipped inside the `crawl4ai` package.
[tool.setuptools.package-data]
crawl4ai = ["js_snippet/*.js"]

# Source of the `version` field declared dynamic in `[project]`: the
# `__version__` attribute of the `crawl4ai.__version__` module.
[tool.setuptools.dynamic]
version = {attr = "crawl4ai.__version__.__version__"}

[tool.uv.sources]
# Tell uv to satisfy the `crawl4ai` requirement from the local workspace
# instead of a package index.
crawl4ai = { workspace = true }

[dependency-groups]
# Development group; it depends on this project itself (resolved through the
# workspace source declared above).
dev = ["crawl4ai"]
docs: update project description emojis - Change project description emojis from 🔥🕷️ to 🚀🤖 - Update emojis consistently in both setup.py and pyproject.toml 2025-01-01 15:39:33 +08:00			`[build-system]`
			`requires = ["setuptools>=64.0.0", "wheel"]`
			`build-backend = "setuptools.build_meta"`

			`[project]`
			`name = "Crawl4AI"`
			`dynamic = ["version"]`
			`description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"`
			`readme = "README.md"`
			`requires-python = ">=3.9"`
chore(deps): upgrade to Python 3.12 and prepare for 0.6.0 release - Update Docker base image to Python 3.12-slim-bookworm - Bump version from 0.6.0rc1 to 0.6.0 - Update documentation to reflect release version changes - Fix license specification in pyproject.toml and setup.py - Clean up code formatting in demo_docker_api.py BREAKING CHANGE: Base Python version upgraded from 3.10 to 3.12 2025-04-23 16:35:15 +08:00			`license = "Apache-2.0"`
docs: update project description emojis - Change project description emojis from 🔥🕷️ to 🚀🤖 - Update emojis consistently in both setup.py and pyproject.toml 2025-01-01 15:39:33 +08:00			`authors = [`
			`{name = "Unclecode", email = "unclecode@kidocode.com"}`
			`]`
build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml 2025-01-01 15:45:27 +08:00			`dependencies = [`
			`"aiosqlite~=0.20",`
			`"lxml~=5.3",`
			`"litellm>=1.53.1",`
			`"numpy>=1.26.0,<3",`
			`"pillow~=10.4",`
			`"playwright>=1.49.0",`
			`"python-dotenv~=1.0",`
			`"requests~=2.26",`
			`"beautifulsoup4~=4.12",`
			`"tf-playwright-stealth>=1.1.0",`
			`"xxhash~=3.4",`
			`"rank-bm25~=0.2",`
			`"aiofiles>=24.1.0",`
			`"colorama~=0.4",`
			`"snowballstemmer~=2.2",`
			`"pydantic>=2.10",`
			`"pyOpenSSL>=24.3.0",`
			`"psutil>=6.1.1",`
			`"nltk>=3.9.1",`
			`"playwright",`
refactor(docs): reorganize documentation structure and update styles Reorganize documentation into core/advanced/extraction sections for better navigation. Update terminal theme styles and add rich library for better CLI output. Remove redundant tutorial files and consolidate content into core sections. Add personal story to index page for project context. BREAKING CHANGE: Documentation structure has been significantly reorganized 2025-01-07 20:49:50 +08:00			`"aiofiles",`
			`"rich>=13.9.4",`
feat(scraping): add LXML-based scraping mode for improved performance Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing. LXML mode offers 10-20x better performance for large HTML documents. Key changes: - Added ScrapingMode enum with BEAUTIFULSOUP and LXML options - Implemented LXMLWebScrapingStrategy class - Added LXML-based metadata extraction - Updated documentation with scraping mode usage and performance considerations - Added cssselect dependency BREAKING CHANGE: None 2025-01-12 20:46:23 +08:00			`"cssselect>=1.2.0",`
feat(browser): add BrowserProfiler class for identity-based browsing Adds a new BrowserProfiler class that provides comprehensive management of browser profiles for identity-based crawling. Features include: - Interactive profile creation and management - Profile listing, retrieval, and deletion - Guided console interface - Migration of profile management from ManagedBrowser - New example script for identity-based browsing ALSO: - Updates logging format in AsyncWebCrawler - Removes content filter from hello_world example - Relaxes httpx version constraint BREAKING CHANGE: Profile management methods from ManagedBrowser are now deprecated and delegate to BrowserProfiler 2025-03-02 20:32:29 +08:00			`"httpx>=0.27.2",`
feat(deep-crawling): add DFS strategy and update exports; refactor CLI entry point 2025-02-09 20:23:40 +08:00			`"fake-useragent>=2.0.3",`
			`"click>=8.1.7",`
feat(crawler): add HTTP crawler strategy for lightweight web scraping Implements a new AsyncHTTPCrawlerStrategy class that provides a fast, memory-efficient alternative to browser-based crawling. Features include: - Support for HTTP/HTTPS requests with configurable methods, headers, and timeouts - File and raw content handling capabilities - Streaming response processing for large files - Customizable request/response hooks - Comprehensive error handling Also refactors browser management code into separate module for better organization. 2025-02-15 19:26:30 +08:00			`"pyperclip>=1.8.2",`
chore: move from faust-cchardet to chardet 2025-04-03 17:42:51 +05:30			`"chardet>=5.2.0",`
feat(cli): add browser profile management functionality Adds new interactive browser profile management system that allows users to: - Create and manage browser profiles for authenticated crawling - List existing profiles with detailed information - Delete unused profiles - Use profiles during crawling with the new -p/--profile flag Also restructures CLI to use Click groups and adds humanize dependency for better size formatting. 2025-03-02 20:54:45 +08:00			`"aiohttp>=3.11.11",`
Chore: Add brotli as dependency to fix: https://github.com/unclecode/crawl4ai/issues/867 2025-03-25 13:44:41 +05:30			`"brotli>=1.1.0",`
feat(schema): improve HTML preprocessing for schema generation Add new preprocess_html_for_schema utility function to better handle HTML cleaning for schema generation. This replaces the previous optimize_html function in the GoogleSearchCrawler and includes smarter attribute handling and pattern detection. Other changes: - Update default provider to gpt-4o - Add DEFAULT_PROVIDER_API_KEY constant - Make LLMConfig creation more flexible with create_llm_config helper - Add new dependencies: zstandard and msgpack This change improves schema generation reliability while reducing noise in the processed HTML. 2025-03-12 22:40:46 +08:00			`"humanize>=4.10.0",`
build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml 2025-01-01 15:45:27 +08:00			`]`
docs: update project description emojis - Change project description emojis from 🔥🕷️ to 🚀🤖 - Update emojis consistently in both setup.py and pyproject.toml 2025-01-01 15:39:33 +08:00			`classifiers = [`
feat(release): prepare v0.4.3 beta release Prepare the v0.4.3 beta release with major feature additions and improvements: - Add JsonXPathExtractionStrategy and LLMContentFilter to exports - Update version to 0.4.3b1 - Improve documentation for dispatchers and markdown generation - Update development status to Beta - Reorganize changelog format BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit 2025-01-21 21:03:11 +08:00			`"Development Status :: 4 - Beta",`
docs: update project description emojis - Change project description emojis from 🔥🕷️ to 🚀🤖 - Update emojis consistently in both setup.py and pyproject.toml 2025-01-01 15:39:33 +08:00			`"Intended Audience :: Developers",`
			`"Programming Language :: Python :: 3",`
			`"Programming Language :: Python :: 3.9",`
			`"Programming Language :: Python :: 3.10",`
			`"Programming Language :: Python :: 3.11",`
			`"Programming Language :: Python :: 3.12",`
			`"Programming Language :: Python :: 3.13",`
			`]`

build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml 2025-01-01 15:45:27 +08:00			`[project.optional-dependencies]`
refactor(pdf): improve PDF processor dependency handling Make PyPDF2 an optional dependency and improve import handling in PDF processor. Move imports inside methods to allow for lazy loading and better error handling. Add new 'pdf' optional dependency group in pyproject.toml. Clean up unused imports and remove deprecated files. BREAKING CHANGE: PyPDF2 is now an optional dependency. Users need to install with 'pip install crawl4ai[pdf]' to use PDF processing features. 2025-02-25 22:27:55 +08:00			`pdf = ["PyPDF2"]`
build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml 2025-01-01 15:45:27 +08:00			`torch = ["torch", "nltk", "scikit-learn"]`
			`transformer = ["transformers", "tokenizers"]`
			`cosine = ["torch", "transformers", "nltk"]`
			`sync = ["selenium"]`
			`all = [`
Release prep (#749) * fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown 2025-02-28 17:23:35 +05:30			`"PyPDF2",`
build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml 2025-01-01 15:45:27 +08:00			`"torch",`
			`"nltk",`
			`"scikit-learn",`
			`"transformers",`
			`"tokenizers",`
refactor(pdf): improve PDF processor dependency handling Make PyPDF2 an optional dependency and improve import handling in PDF processor. Move imports inside methods to allow for lazy loading and better error handling. Add new 'pdf' optional dependency group in pyproject.toml. Clean up unused imports and remove deprecated files. BREAKING CHANGE: PyPDF2 is now an optional dependency. Users need to install with 'pip install crawl4ai[pdf]' to use PDF processing features. 2025-02-25 22:27:55 +08:00			`"selenium",`
			`"PyPDF2"`
build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml 2025-01-01 15:45:27 +08:00			`]`

			`[project.scripts]`
			`crawl4ai-download-models = "crawl4ai.model_loader:main"`
			`crawl4ai-migrate = "crawl4ai.migrations:main"`
			`crawl4ai-setup = "crawl4ai.install:post_install"`
feat(install): add doctor command and force browser install - Add --force flag to Playwright browser installation - Add doctor command to test crawling functionality - Install Chrome and Chromium browsers explicitly - Add crawl4ai-doctor entry point in pyproject.toml - Implement simple health check focused on crawling test 2025-01-01 16:33:43 +08:00			`crawl4ai-doctor = "crawl4ai.install:doctor"`
refactor(cli): improve CLI default command handling Make 'crawl' the default command when no command is specified. This improves user experience by allowing direct URL input without explicitly specifying the 'crawl' command. Also removes unnecessary blank lines in example code for better readability. 2025-03-04 20:28:16 +08:00			`crwl = "crawl4ai.cli:main"`
build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml 2025-01-01 15:45:27 +08:00
docs: update project description emojis - Change project description emojis from 🔥🕷️ to 🚀🤖 - Update emojis consistently in both setup.py and pyproject.toml 2025-01-01 15:39:33 +08:00			`[tool.setuptools]`
build: streamline package discovery and bump to v0.4.243 - Replace explicit package listing with setuptools.find - Include all crawl4ai.* packages automatically - Use `packages = {find = {where = ["."], include = ["crawl4ai*"]}}` syntax - Bump version to 0.4.243 This change simplifies package maintenance by automatically discovering all subpackages under crawl4ai namespace instead of listing them manually. 2025-01-01 17:53:51 +08:00			`packages = {find = {where = ["."], include = ["crawl4ai*"]}}`
fix: ensure js_snippet files are included in package - Add js_snippet to packages list in pyproject.toml - Verified JS files are properly included in installed package - Bump version to 0.4.242 2025-01-01 17:38:59 +08:00
			`[tool.setuptools.package-data]`
			`crawl4ai = ["js_snippet/*.js"]`
docs: update project description emojis - Change project description emojis from 🔥🕷️ to 🚀🤖 - Update emojis consistently in both setup.py and pyproject.toml 2025-01-01 15:39:33 +08:00
			`[tool.setuptools.dynamic]`
refactor(browser): improve browser path management Implement more robust browser executable path handling using playwright's built-in browser management. This change: - Adds async browser path resolution - Implements path caching in the home folder - Removes hardcoded browser paths - Adds httpx dependency - Removes obsolete test result files This change makes the browser path resolution more reliable across different platforms and environments. 2025-01-17 22:14:37 +08:00			`version = {attr = "crawl4ai.__version__.__version__"}`

			`[tool.uv.sources]`
			`crawl4ai = { workspace = true }`

			`[dependency-groups]`
			`dev = [`
			`"crawl4ai",`
			`]`