76 lines
1.7 KiB
Python
76 lines
1.7 KiB
Python
|
|
import requests, base64, os
|
|
|
|
# Example client for the crawl endpoint: POST a JSON payload describing what
# to crawl, then print the pieces of the first result and save its screenshot.
#
# Exactly one `data` payload should be active at a time. The default one below
# crawls a single page and asks for a screenshot; the commented-out variants
# show other request options and can be swapped in.
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "screenshot": True,
}

# Example of filtering the content using CSS selectors
# data = {
#     "urls": [
#         "https://www.nbcnews.com/business"
#     ],
#     "css_selector": "article",
#     "screenshot": True,
# }

# Example of executing a JS script on the page before extracting the content
# data = {
#     "urls": [
#         "https://www.nbcnews.com/business"
#     ],
#     "screenshot": True,
#     'js' : ["""
#     const loadMoreButton = Array.from(document.querySelectorAll('button')).
#     find(button => button.textContent.includes('Load More'));
#     loadMoreButton && loadMoreButton.click();
#     """]
# }

# Example of using a custom extraction strategy
# data = {
#     "urls": [
#         "https://www.nbcnews.com/business"
#     ],
#     "extraction_strategy": "CosineStrategy",
#     "extraction_strategy_args": {
#         "semantic_filter": "inflation rent prices"
#     },
# }

# Example of using LLM to extract content
# data = {
#     "urls": [
#         "https://www.nbcnews.com/business"
#     ],
#     "extraction_strategy": "LLMExtractionStrategy",
#     "extraction_strategy_args": {
#         "provider": "groq/llama3-8b-8192",
#         "api_token": os.environ.get("GROQ_API_KEY"),
#         "instruction": """I am interested in only financial news,
#         and translate them in French."""
#     },
# }

# Without a timeout, requests waits forever on an unresponsive server.
# NOTE(review): 120 seconds is a generous guess for a remote crawl; tune it
# to the latency actually observed for this service.
response = requests.post("https://crawl4ai.com/crawl", json=data, timeout=120)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
response.raise_for_status()
result = response.json()['results'][0]

print(result['markdown'])
print(result['cleaned_html'])
print(result['media'])
print(result['extracted_content'])

# Some of the payloads above do not request a screenshot, so only decode and
# write the file when the result actually carries screenshot data.
# NOTE(review): presumably the field is missing or null in that case; confirm
# against the service's response schema.
screenshot_b64 = result.get('screenshot')
if screenshot_b64:
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(screenshot_b64))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|