crawl4ai/tests/general/test_schema_builder.py
unclecode 66ac07b4f3 feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:
- Add capture_network_requests and capture_console_messages config parameters
- Add network_requests and console_messages fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging,
security analysis, performance profiling, and API discovery in web applications.
2025-04-10 16:03:48 +08:00

112 lines
5.2 KiB
Python

# https://claude.ai/chat/c4bbe93d-fb54-44ce-92af-76b4c8086c6b
# https://claude.ai/chat/c24a768c-d8b2-478a-acc7-d76d42a308da
import os, sys
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
import json
# Test HTML - A complex job board with companies, departments, and positions.
# The fixture holds a single company ("Google") with two departments
# (Engineering and Marketing); each department has one position card carrying
# a title, salary range, location / job type / experience metadata, a list of
# required skills, a description, and application info.
# It is the HTML input handed to generate_schema() in test_schema_generation().
test_html = """
<div class="company-listings">
<div class="company" data-company-id="123">
<div class="company-header">
<img class="company-logo" src="google.png" alt="Google">
<h1 class="company-name">Google</h1>
<div class="company-meta">
<span class="company-size">10,000+ employees</span>
<span class="company-industry">Technology</span>
<a href="https://google.careers" class="careers-link">Careers Page</a>
</div>
</div>
<div class="departments">
<div class="department">
<h2 class="department-name">Engineering</h2>
<div class="positions">
<div class="position-card" data-position-id="eng-1">
<h3 class="position-title">Senior Software Engineer</h3>
<span class="salary-range">$150,000 - $250,000</span>
<div class="position-meta">
<span class="location">Mountain View, CA</span>
<span class="job-type">Full-time</span>
<span class="experience">5+ years</span>
</div>
<div class="skills-required">
<span class="skill">Python</span>
<span class="skill">Kubernetes</span>
<span class="skill">Machine Learning</span>
</div>
<p class="position-description">Join our core engineering team...</p>
<div class="application-info">
<span class="posting-date">Posted: 2024-03-15</span>
<button class="apply-btn" data-req-id="REQ12345">Apply Now</button>
</div>
</div>
<!-- More positions -->
</div>
</div>
<div class="department">
<h2 class="department-name">Marketing</h2>
<div class="positions">
<div class="position-card" data-position-id="mkt-1">
<h3 class="position-title">Growth Marketing Manager</h3>
<span class="salary-range">$120,000 - $180,000</span>
<div class="position-meta">
<span class="location">New York, NY</span>
<span class="job-type">Full-time</span>
<span class="experience">3+ years</span>
</div>
<div class="skills-required">
<span class="skill">SEO</span>
<span class="skill">Analytics</span>
<span class="skill">Content Strategy</span>
</div>
<p class="position-description">Drive our growth initiatives...</p>
<div class="application-info">
<span class="posting-date">Posted: 2024-03-14</span>
<button class="apply-btn" data-req-id="REQ12346">Apply Now</button>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
"""
# Test cases
def test_schema_generation():
    """Generate and print extraction schemas for ``test_html``.

    Runs ``JsonCssExtractionStrategy.generate_schema`` four times against the
    module-level ``test_html`` fixture: once with no query and three times
    with a natural-language query. Each returned schema is pretty-printed as
    JSON under a banner line.

    Nothing is asserted; the printed output is meant for manual inspection.
    """
    # (banner, query) pairs. A query of None means "call without a query".
    cases = [
        # Test 1: No query (should extract everything)
        ("\nTest 1: No Query (Full Schema)", None),
        # Test 2: Query for just basic job info
        (
            "\nTest 2: Basic Job Info Query",
            "I only need job titles, salaries, and locations",
        ),
        # Test 3: Query for company and department structure
        (
            "\nTest 3: Organizational Structure Query",
            "Extract company details and department names, without position details",
        ),
        # Test 4: Query for specific skills tracking
        (
            "\nTest 4: Skills Analysis Query",
            "I want to analyze required skills across all positions",
        ),
    ]

    for banner, query in cases:
        print(banner)
        if query is None:
            schema = JsonCssExtractionStrategy.generate_schema(test_html)
        else:
            # Pass the query by keyword. NOTE(review): in crawl4ai the second
            # positional parameter of generate_schema is `schema_type`, so a
            # positional query would be taken as the schema type and never
            # reach the prompt -- confirm against the installed version.
            schema = JsonCssExtractionStrategy.generate_schema(
                test_html, query=query
            )
        print(json.dumps(schema, indent=2))
# Allow running this file directly as a script (outside a test runner):
# executes the schema-generation demo and prints the resulting schemas.
if __name__ == "__main__":
    test_schema_generation()