# crawl4ai/tests/general/test_deep_crawl_scorers.py

from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer


def test_scorers():
    """Check every URL scorer, and a composite of all of them, against fixed scores.

    Each entry in ``test_cases`` names a scorer type, the keyword arguments used
    to construct it, and a mapping of URL -> expected score. The same scorer
    configurations are then combined into a ``CompositeScorer`` and checked
    against a second table of expected scores.

    A per-URL pass/fail line is printed for every check, and the test fails at
    the end if any check did not match, so a regression is reported by pytest
    instead of only appearing in the captured output.

    Raises:
        AssertionError: if any computed score differs from its expected value
            by more than ``tolerance``.
        ValueError: if a test case names a scorer type with no known class.
    """
    # Largest allowed absolute difference between computed and expected score.
    tolerance = 0.00001

    test_cases = [
        # Keyword Scorer Tests
        {
            "scorer_type": "keyword",
            "config": {
                "keywords": ["python", "blog"],
                "weight": 1.0,
                "case_sensitive": False
            },
            "urls": {
                "https://example.com/python-blog": 1.0,
                "https://example.com/PYTHON-BLOG": 1.0,
                "https://example.com/python-only": 0.5,
                "https://example.com/other": 0.0
            }
        },

        # Path Depth Scorer Tests
        {
            "scorer_type": "path_depth",
            "config": {
                "optimal_depth": 2,
                "weight": 1.0
            },
            "urls": {
                "https://example.com/a/b": 1.0,
                "https://example.com/a": 0.5,
                "https://example.com/a/b/c": 0.5,
                "https://example.com": 0.33333333
            }
        },

        # Content Type Scorer Tests
        {
            "scorer_type": "content_type",
            "config": {
                "type_weights": {
                    ".html$": 1.0,
                    ".pdf$": 0.8,
                    ".jpg$": 0.6
                },
                "weight": 1.0
            },
            "urls": {
                "https://example.com/doc.html": 1.0,
                "https://example.com/doc.pdf": 0.8,
                "https://example.com/img.jpg": 0.6,
                "https://example.com/other.txt": 0.0
            }
        },

        # Freshness Scorer Tests
        {
            "scorer_type": "freshness",
            "config": {
                # current_year is not listed here; create_scorer() pins it to
                # 2024 so the expected scores below do not drift over time.
                "weight": 1.0,
            },
            "urls": {
                "https://example.com/2024/01/post": 1.0,
                "https://example.com/2023/12/post": 0.9,
                "https://example.com/2022/post": 0.8,
                "https://example.com/no-date": 0.5
            }
        },

        # Domain Authority Scorer Tests
        {
            "scorer_type": "domain",
            "config": {
                "domain_weights": {
                    "python.org": 1.0,
                    "github.com": 0.8,
                    "medium.com": 0.6
                },
                "default_weight": 0.3,
                "weight": 1.0
            },
            "urls": {
                "https://python.org/about": 1.0,
                "https://github.com/repo": 0.8,
                "https://medium.com/post": 0.6,
                "https://unknown.com": 0.3
            }
        }
    ]

    def create_scorer(scorer_type, config):
        """Build the scorer for *scorer_type* from its *config* keyword arguments."""
        if scorer_type == "keyword":
            return KeywordRelevanceScorer(**config)
        if scorer_type == "path_depth":
            return PathDepthScorer(**config)
        if scorer_type == "content_type":
            return ContentTypeScorer(**config)
        if scorer_type == "freshness":
            # Fixed reference year keeps the date-based expectations stable.
            return FreshnessScorer(**config, current_year=2024)
        if scorer_type == "domain":
            return DomainAuthorityScorer(**config)
        # Fail loudly here rather than returning None and crashing later on
        # `None.score(...)` with a less helpful error.
        raise ValueError(f"Unknown scorer type: {scorer_type!r}")

    def run_accuracy_test():
        """Score every URL of every test case; return a list of failure messages."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        failures = []
        for test_case in test_cases:
            print(f"\nTesting {test_case['scorer_type']} scorer:")
            scorer = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )

            for url, expected in test_case['urls'].items():
                # Rounded only so the printed values are readable.
                score = round(scorer.score(url), 8)
                expected = round(expected, 8)

                if abs(score - expected) > tolerance:
                    print(f"❌ Scorer Failed: URL '{url}'")
                    print(f"   Expected: {expected}, Got: {score}")
                    failures.append(
                        f"{test_case['scorer_type']} scorer: '{url}' "
                        f"expected {expected}, got {score}"
                    )
                else:
                    print(f"✅ Scorer Passed: URL '{url}'")

        return failures

    def run_composite_test():
        """Score URLs with a composite of all scorers; return failure messages."""
        print("\nTesting Composite Scorer:")
        print("-" * 50)

        # Expected composite scores for URLs that exercise several scorers at once.
        test_urls = {
            "https://python.org/blog/2024/01/new-release.html": 0.86666667,
            "https://github.com/repo/old-code.pdf": 0.62,
            "https://unknown.com/random": 0.26
        }

        # One scorer per test case, built from the same configurations as above.
        scorers = [
            create_scorer(test_case['scorer_type'], test_case['config'])
            for test_case in test_cases
        ]

        composite = CompositeScorer(scorers, normalize=True)

        failures = []
        for url, expected in test_urls.items():
            score = round(composite.score(url), 8)

            if abs(score - expected) > tolerance:
                print(f"❌ Composite Failed: URL '{url}'")
                print(f"   Expected: {expected}, Got: {score}")
                failures.append(
                    f"composite scorer: '{url}' expected {expected}, got {score}"
                )
            else:
                print(f"✅ Composite Passed: URL '{url}'")

        return failures

    # Run tests
    print("Running Scorer Tests...")
    failures = run_accuracy_test() + run_composite_test()

    if not failures:
        print("\n✨ All tests passed!")
        # Note: Already have performance tests in run_scorer_performance_test()
    else:
        print("\n❌ Some tests failed!")

    # Without this assertion the test would always be reported as passing,
    # because the checks above only print their outcome.
    assert not failures, "Scorer mismatches:\n" + "\n".join(failures)


# Allow running this file directly as a script, outside of a test runner.
if __name__ == "__main__":
    test_scorers()