mirror of
https://github.com/unclecode/crawl4ai.git
synced 2025-10-10 10:00:23 +00:00
207 lines
5.5 KiB
Markdown
207 lines
5.5 KiB
Markdown
![]() |
# Page Interaction
|
||
|
|
||
|
Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.
|
||
|
|
||
|
## JavaScript Execution
|
||
|
|
||
|
### Basic Execution
|
||
|
|
||
|
```python
|
||
|
# Single JavaScript command
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
js_code="window.scrollTo(0, document.body.scrollHeight);"
|
||
|
)
|
||
|
|
||
|
# Multiple commands
|
||
|
js_commands = [
|
||
|
"window.scrollTo(0, document.body.scrollHeight);",
|
||
|
"document.querySelector('.load-more').click();",
|
||
|
"document.querySelector('#consent-button').click();"
|
||
|
]
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
js_code=js_commands
|
||
|
)
|
||
|
```
|
||
|
|
||
|
## Wait Conditions
|
||
|
|
||
|
### CSS-Based Waiting
|
||
|
|
||
|
Wait for elements to appear:
|
||
|
|
||
|
```python
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
wait_for="css:.dynamic-content" # Wait for element with class 'dynamic-content'
|
||
|
)
|
||
|
```
|
||
|
|
||
|
### JavaScript-Based Waiting
|
||
|
|
||
|
Wait for custom conditions:
|
||
|
|
||
|
```python
|
||
|
# Wait for number of elements
|
||
|
wait_condition = """() => {
|
||
|
return document.querySelectorAll('.item').length > 10;
|
||
|
}"""
|
||
|
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
wait_for=f"js:{wait_condition}"
|
||
|
)
|
||
|
|
||
|
# Wait for dynamic content to load
|
||
|
wait_for_content = """() => {
|
||
|
const content = document.querySelector('.content');
|
||
|
return content && content.innerText.length > 100;
|
||
|
}"""
|
||
|
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
wait_for=f"js:{wait_for_content}"
|
||
|
)
|
||
|
```
|
||
|
|
||
|
## Handling Dynamic Content
|
||
|
|
||
|
### Load More Content
|
||
|
|
||
|
Handle infinite scroll or load more buttons:
|
||
|
|
||
|
```python
|
||
|
# Scroll and wait pattern
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
js_code=[
|
||
|
# Scroll to bottom
|
||
|
"window.scrollTo(0, document.body.scrollHeight);",
|
||
|
# Click load more if exists
|
||
|
"const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();"
|
||
|
],
|
||
|
# Wait for new content
|
||
|
wait_for="js:() => document.querySelectorAll('.item').length > previousCount"
|
||
|
)
|
||
|
```
|
||
|
|
||
|
### Form Interaction
|
||
|
|
||
|
Handle forms and inputs:
|
||
|
|
||
|
```python
|
||
|
js_form_interaction = """
|
||
|
// Fill form fields
|
||
|
document.querySelector('#search').value = 'search term';
|
||
|
// Submit form
|
||
|
document.querySelector('form').submit();
|
||
|
"""
|
||
|
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
js_code=js_form_interaction,
|
||
|
wait_for="css:.results" # Wait for results to load
|
||
|
)
|
||
|
```
|
||
|
|
||
|
## Timing Control
|
||
|
|
||
|
### Delays and Timeouts
|
||
|
|
||
|
Control timing of interactions:
|
||
|
|
||
|
```python
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
page_timeout=60000, # Page load timeout (ms)
|
||
|
delay_before_return_html=2.0, # Wait before capturing content
|
||
|
)
|
||
|
```
|
||
|
|
||
|
## Complex Interactions Example
|
||
|
|
||
|
Here's an example of handling a dynamic page with multiple interactions:
|
||
|
|
||
|
```python
|
||
|
async def crawl_dynamic_content():
|
||
|
async with AsyncWebCrawler() as crawler:
|
||
|
# Initial page load
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
# Handle cookie consent
|
||
|
js_code="document.querySelector('.cookie-accept')?.click();",
|
||
|
wait_for="css:.main-content"
|
||
|
)
|
||
|
|
||
|
# Load more content
|
||
|
session_id = "dynamic_session" # Keep session for multiple interactions
|
||
|
|
||
|
for page in range(3): # Load 3 pages of content
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
session_id=session_id,
|
||
|
js_code=[
|
||
|
# Scroll to bottom
|
||
|
"window.scrollTo(0, document.body.scrollHeight);",
|
||
|
# Store current item count
|
||
|
"window.previousCount = document.querySelectorAll('.item').length;",
|
||
|
# Click load more
|
||
|
"document.querySelector('.load-more')?.click();"
|
||
|
],
|
||
|
# Wait for new items
|
||
|
wait_for="""() => {
|
||
|
const currentCount = document.querySelectorAll('.item').length;
|
||
|
return currentCount > window.previousCount;
|
||
|
}""",
|
||
|
# Only execute JS without reloading page
|
||
|
js_only=True if page > 0 else False
|
||
|
)
|
||
|
|
||
|
# Process content after each load
|
||
|
print(f"Page {page + 1} items:", len(result.cleaned_html))
|
||
|
|
||
|
# Clean up session
|
||
|
await crawler.crawler_strategy.kill_session(session_id)
|
||
|
```
|
||
|
|
||
|
## Using with Extraction Strategies
|
||
|
|
||
|
Combine page interaction with structured extraction:
|
||
|
|
||
|
```python
|
||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||
|
|
||
|
# Pattern-based extraction after interaction
|
||
|
schema = {
|
||
|
"name": "Dynamic Items",
|
||
|
"baseSelector": ".item",
|
||
|
"fields": [
|
||
|
{"name": "title", "selector": "h2", "type": "text"},
|
||
|
{"name": "description", "selector": ".desc", "type": "text"}
|
||
|
]
|
||
|
}
|
||
|
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||
|
wait_for="css:.item:nth-child(10)", # Wait for 10 items
|
||
|
extraction_strategy=JsonCssExtractionStrategy(schema)
|
||
|
)
|
||
|
|
||
|
# Or use LLM to analyze dynamic content
|
||
|
class ContentAnalysis(BaseModel):
|
||
|
topics: List[str]
|
||
|
summary: str
|
||
|
|
||
|
result = await crawler.arun(
|
||
|
url="https://example.com",
|
||
|
js_code="document.querySelector('.show-more').click();",
|
||
|
wait_for="css:.full-content",
|
||
|
extraction_strategy=LLMExtractionStrategy(
|
||
|
provider="ollama/nemotron",
|
||
|
schema=ContentAnalysis.schema(),
|
||
|
instruction="Analyze the full content"
|
||
|
)
|
||
|
)
|
||
|
```
|