haystack/docs-website/scripts/generate_requirements.py

#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
"""
Generate requirements.txt from Haystack's pyproject.toml for docs snippet testing.
This script fetches the pyproject.toml from a specific Haystack version or branch,
parses it, and generates a requirements.txt with all dependencies needed
to run the Python code snippets in the documentation.
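
Typical invocation (the flags are defined in main() below; adjust the path to
wherever this script lives in your checkout):

    python scripts/generate_requirements.py --version 2.16.1 --output requirements.txt
    python scripts/generate_requirements.py --version main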
"""
import argparse
import sys
from pathlib import Path
import requests
import toml

# Split on extras ("["), version operators (including bare ">" and "<"), and ";"
# (environment markers) so only the bare package name remains.
_VERSION_SPLITTERS = ("[", "==", ">=", ">", "<", "!=", "~=", ";")


def _package_name(dep: str) -> str:
    """Return the dependency name stripped of extras and version specifiers."""
    candidate = dep
    for splitter in _VERSION_SPLITTERS:
        candidate = candidate.split(splitter)[0]
    return candidate.strip()


def fetch_haystack_deps(version="main"):
    """
    Fetch and parse Haystack's pyproject.toml to extract dependencies.

    Args:
        version: Haystack version (e.g., "2.16.1", "main", "develop")
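
    Returns:
        A list of requirement strings: core, test, and e2e dependencies,
        deduplicated, with test-only tooling filtered out.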
"""
if version == "main":
url = "https://raw.githubusercontent.com/deepset-ai/haystack/refs/heads/main/pyproject.toml"
elif version == "develop":
url = "https://raw.githubusercontent.com/deepset-ai/haystack/refs/heads/develop/pyproject.toml"
else:
# Format version tag properly (add 'v' prefix if not present)
if not version.startswith("v"):
version = f"v{version}"
url = f"https://raw.githubusercontent.com/deepset-ai/haystack/refs/tags/{version}/pyproject.toml"

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch pyproject.toml for version {version}: {e}")
        print(f"URL: {url}")
        sys.exit(1)

    try:
        config = toml.loads(response.text)
    except toml.TomlDecodeError as e:
        print(f"Failed to parse pyproject.toml: {e}")
        sys.exit(1)

    # Core project dependencies
    core_deps = config.get("project", {}).get("dependencies", [])

    # Test environment dependencies (most comprehensive for docs testing)
    test_env = config.get("tool", {}).get("hatch", {}).get("envs", {}).get("test", {})
    test_deps = test_env.get("dependencies", []) if test_env else []

    # E2E environment dependencies (additional components)
    e2e_env = config.get("tool", {}).get("hatch", {}).get("envs", {}).get("e2e", {})
    e2e_deps = e2e_env.get("dependencies", []) if e2e_env else []

    # Combine all dependencies
    all_deps = []
    all_deps.extend(core_deps)
    all_deps.extend(test_deps)
    all_deps.extend(e2e_deps)

    # Remove duplicates while preserving order
    seen = set()
    unique_deps = []
    for dep in all_deps:
        package_name = _package_name(dep)
        if package_name not in seen:
            seen.add(package_name)
            unique_deps.append(dep)
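    # Dedup is keyed on the bare package name, so the first occurrence wins:
    # core pins take precedence over test/e2e entries for the same package.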

    # Filter out test-only dependencies that aren't needed for docs
    test_only_packages = {
        "pytest",
        "pytest-bdd",
        "pytest-cov",
        "pytest-asyncio",
        "pytest-rerunfailures",
        "coverage",
        "mypy",
        "pylint",
        "ipython",
        "colorama",
    }
    filtered_deps = []
    for dep in unique_deps:
        package_name = _package_name(dep)
        if package_name.lower() not in test_only_packages:
            filtered_deps.append(dep)

    return filtered_deps


def main():
    """Entry point for generating requirements for docs snippet tests."""
    parser = argparse.ArgumentParser(
        description="Generate requirements.txt from Haystack's pyproject.toml for docs snippet testing"
    )
    parser.add_argument(
        "--version",
        "-v",
        default="main",
        help="Haystack version to fetch dependencies for (e.g., '2.16.1', 'main', 'develop'). Default: main",
    )
    parser.add_argument(
        "--output", "-o", default="requirements.txt", help="Output file path. Default: requirements.txt"
    )
    parser.add_argument("--verbose", action="store_true", help="Show detailed output including all dependencies")
    args = parser.parse_args()

    print(f"Fetching Haystack dependencies for version: {args.version}")
    deps = fetch_haystack_deps(args.version)

    requirements_content = f"""# Auto-generated from Haystack pyproject.toml (version: {args.version})
# For testing docs snippets
# Generated by scripts/generate_requirements.py
"""
    for dep in sorted(deps):
        requirements_content += f"{dep}\n"
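    # The written file is therefore the three header comments above followed by
    # the surviving requirement strings (copied verbatim from pyproject.toml),
    # one per line in sorted order.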

    requirements_path = Path(args.output)
    requirements_path.write_text(requirements_content)

    print(f"Generated {requirements_path} with {len(deps)} dependencies")
    if args.verbose:
        print("\nTop-level dependencies:")
        for dep in sorted(deps)[:10]:  # Show first 10
            print(f" {dep}")
        if len(deps) > 10:
            print(f" ... and {len(deps) - 10} more")


if __name__ == "__main__":
    main()