- User agent

- Extract Links
- Extract Metadata
- Update Readme
- Update REST API document
unclecode 2024-06-08 17:59:42 +08:00
parent 9c34b30723
commit b3a0edaa6d
12 changed files with 155 additions and 75 deletions

View File

@ -1 +1,4 @@
# Changelog
## TODO:
- User agent: "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",

View File

@ -14,6 +14,9 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
### v0.2.3
- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media`
- 🔗 Extract all external and internal links. Check `result.links` (see the usage sketch after this list)
- 📚 Extract metadata from the page. Check `result.metadata`
- 🕵️ Support `user_agent` parameter to set the user agent for the HTTP requests.
- 🖼️ Take [screenshots](#taking-screenshots) of the page.
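A minimal sketch of how the v0.2.3 additions are used together, assuming the usual `WebCrawler()` / `warmup()` setup from this README (the user-agent string is just the one from the TODO note above):

```python
from crawl4ai import WebCrawler

crawler = WebCrawler()
crawler.warmup()

# New in v0.2.3: a custom user agent plus links/metadata/media on the result.
result = crawler.run(
    url="https://www.nbcnews.com/business",
    user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
)
print(result.links.keys())             # e.g. dict_keys(['external', 'internal'])
print(result.media.keys())             # e.g. dict_keys(['images', 'audio', 'video'])
print((result.metadata or {}).get("title"))
```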
### v0.2.2
@ -32,7 +35,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Power and Simplicity of Crawl4AI 🚀
The easiest way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server; I may improve its capacity if I see demand. You can find all examples of the REST API in this Colab notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing)
```python
import requests
@ -41,7 +44,6 @@ data = {
"urls": [
"https://www.nbcnews.com/business"
],
"word_count_threshold": 5,
"screenshot": True
}
@ -242,8 +244,12 @@ To use the REST API, send a POST request to `http://localhost:8000/crawl` with t
"url": "https://www.nbcnews.com/business",
"extracted_content": "...",
"html": "...",
"cleaned_html": "...",
"markdown": "...",
"metadata": {...}
"media": {...},
"links": {...},
"metadata": {...},
"screenshots": "...",
}
]
}
@ -282,6 +288,24 @@ Crawl result without raw HTML content:
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
```
### Result Structure
The result object contains the following fields:
```python
class CrawlResult(BaseModel):
url: str
html: str
success: bool
cleaned_html: Optional[str] = None
media: Dict[str, List[Dict]] = {} # Media tags in the page {"images": [], "audio": [], "video": []}
links: Dict[str, List[Dict]] = {} # Links in the page {"external": [], "internal": []}
screenshot: Optional[str] = None # Base64 encoded screenshot
markdown: Optional[str] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
```
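A short sketch of reading the new fields from a returned `CrawlResult`; the grouping keys come from the comments in the model above, while the shape of each individual media or link entry is left unspecified here:

```python
from crawl4ai import WebCrawler

crawler = WebCrawler()
crawler.warmup()
result = crawler.run(url="https://www.nbcnews.com/business")

for image in result.media.get("images", []):
    print(image)                      # each entry is a dict describing one image tag

for link in result.links.get("external", []):
    print(link)                       # each entry is a dict describing one external link

if result.metadata:
    print(result.metadata.get("title"), result.metadata.get("description"))
```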
### Taking Screenshots
```python
@ -401,6 +425,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
| `user_agent` | The user agent to use for the HTTP requests. | No | `Mozilla/5.0` |
| `verbose` | Whether to enable verbose logging. | No | `true` |
## Chunking Strategies 📚

View File

@ -44,6 +44,10 @@ class CrawlerStrategy(ABC):
@abstractmethod
def take_screenshot(self, save_path: str):
pass
@abstractmethod
def update_user_agent(self, user_agent: str):
pass
class CloudCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html = False):
@ -69,6 +73,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
self.options = Options()
self.options.headless = True
if kwargs.get("user_agent"):
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
self.options.add_argument("--no-sandbox")
self.options.add_argument("--headless")
# self.options.add_argument("--disable-dev-shm-usage")
@ -97,6 +103,11 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.service.log_path = "NUL"
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def update_user_agent(self, user_agent: str):
self.options.add_argument(f"user-agent={user_agent}")
self.driver.quit()
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def crawl(self, url: str) -> str:
if self.use_cached_html:
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
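A small sketch of driving the Selenium strategy directly with the new user-agent support; the import path (and the second user-agent string) are assumptions for illustration:

```python
# Assumed import path for the strategy defined above.
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36"

# The constructor picks the user agent up from kwargs and adds it as a Chrome argument.
strategy = LocalSeleniumCrawlerStrategy(user_agent=ua)
html = strategy.crawl("https://www.nbcnews.com/business")

# update_user_agent() appends the new argument and restarts the Chrome driver.
strategy.update_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36")
```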

View File

@ -21,6 +21,7 @@ def init_db():
success BOOLEAN,
media TEXT DEFAULT "{}",
link TEXT DEFAULT "{}",
metadata TEXT DEFAULT "{}",
screenshot TEXT DEFAULT ""
)
''')
@ -42,12 +43,12 @@ def check_db_path():
if not DB_PATH:
raise ValueError("Database path is not set or is empty.")
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, bool, str]]:
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot FROM crawled_data WHERE url = ?', (url,))
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone()
conn.close()
return result
@ -55,14 +56,14 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, boo
print(f"Error retrieving cached URL: {e}")
return None
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", screenshot: str = ""):
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
@ -70,9 +71,10 @@ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_c
extracted_content = excluded.extracted_content,
success = excluded.success,
media = excluded.media,
links = excluded.links,
metadata = excluded.metadata,
screenshot = excluded.screenshot
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, screenshot))
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
conn.commit()
conn.close()
except Exception as e:
@ -126,5 +128,5 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
if __name__ == "__main__":
init_db() # Initialize the database if not already initialized
alter_db_add_screenshot("links") # Add the new column to the table
update_existing_records("links") # Update existing records to set the new column to an empty string
alter_db_add_screenshot("metadata") # Add the new column to the table
update_existing_records("metadata") # Update existing records to set the new column to an empty string
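A sketch of reading the new column back through the cache helpers shown above; the import path, and the assumption that the database has already been initialized (for example by a prior crawl), are mine:

```python
import json

# Assumed import path for the cache helpers defined above.
from crawl4ai.database import get_cached_url

row = get_cached_url("https://www.nbcnews.com/business")
if row:
    # Column order per the updated SELECT: url, html, cleaned_html, markdown,
    # extracted_content, success, media, links, metadata, screenshot.
    metadata = json.loads(row[8] or "{}")
    print(metadata.get("title"))
```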

View File

@ -359,6 +359,47 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
print('Error processing HTML content:', str(e))
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
def extract_metadata(html):
metadata = {}
if not html:
return metadata
# Parse HTML content with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
# Title
title_tag = soup.find('title')
metadata['title'] = title_tag.string if title_tag else None
# Meta description
description_tag = soup.find('meta', attrs={'name': 'description'})
metadata['description'] = description_tag['content'] if description_tag else None
# Meta keywords
keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
metadata['keywords'] = keywords_tag['content'] if keywords_tag else None
# Meta author
author_tag = soup.find('meta', attrs={'name': 'author'})
metadata['author'] = author_tag['content'] if author_tag else None
# Open Graph metadata
og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
for tag in og_tags:
property_name = tag['property']
metadata[property_name] = tag['content']
# Twitter Card metadata
twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
for tag in twitter_tags:
property_name = tag['name']
metadata[property_name] = tag['content']
return metadata
def extract_xml_tags(string):
tags = re.findall(r'<(\w+)>', string)
return list(set(tags))
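For a quick sanity check, `extract_metadata` can be exercised on a small inline document; the import path is an assumption about where this utils module lives:

```python
# Assumed import path for the helper defined above.
from crawl4ai.utils import extract_metadata

html = """
<html><head>
  <title>Example</title>
  <meta name="description" content="A test page">
  <meta property="og:title" content="Example OG title">
  <meta name="twitter:card" content="summary">
</head><body></body></html>
"""

print(extract_metadata(html))
# {'title': 'Example', 'description': 'A test page', 'keywords': None,
#  'author': None, 'og:title': 'Example OG title', 'twitter:card': 'summary'}
```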

View File

@ -89,8 +89,11 @@ class WebCrawler:
css_selector: str = None,
screenshot: bool = False,
verbose=True,
user_agent: str = None,
**kwargs,
) -> CrawlResult:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
@ -117,7 +120,8 @@ class WebCrawler:
"success": cached[5],
"media": json.loads(cached[6] or "{}"),
"links": json.loads(cached[7] or "{}"),
"screenshot": cached[8],
"metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
"screenshot": cached[9],
"error_message": "",
}
)
@ -135,6 +139,7 @@ class WebCrawler:
# Extract content from HTML
try:
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
metadata = extract_metadata(html)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
@ -180,6 +185,7 @@ class WebCrawler:
success,
json.dumps(media),
json.dumps(links),
json.dumps(metadata),
screenshot=base64_image,
)
@ -190,6 +196,7 @@ class WebCrawler:
markdown=markdown,
media=media,
links=links,
metadata=metadata,
screenshot=base64_image,
extracted_content=extracted_content,
success=success,

Binary image files changed (not shown): one existing screenshot updated (344 KiB → 372 KiB) and three new screenshots added (537 KiB, 419 KiB, 485 KiB).

View File

@ -1,75 +1,64 @@
import requests, base64, os
data = {
"urls": ["https://www.nbcnews.com/business"],
"screenshot": True,
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result['screenshot']))
# Example of filtering the content using CSS selectors
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"css_selector": "article",
"screenshot": True,
}
# Example of executing a JS script on the page before extracting the content
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"screenshot": True,
'js' : ["""
const loadMoreButton = Array.from(document.querySelectorAll('button')).
find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""]
}
# Example of filtering the content using CSS selectors
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "css_selector": "article",
# "screenshot": True,
# }
# Example of executing a JS script on the page before extracting the content
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "screenshot": True,
# 'js' : ["""
# const loadMoreButton = Array.from(document.querySelectorAll('button')).
# find(button => button.textContent.includes('Load More'));
# loadMoreButton && loadMoreButton.click();
# """]
# }
# Example of using a custom extraction strategy
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "extraction_strategy": "CosineStrategy",
# "extraction_strategy_args": {
# "semantic_filter": "inflation rent prices"
# },
# }
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"extraction_strategy": "CosineStrategy",
"extraction_strategy_args": {
"semantic_filter": "inflation rent prices"
},
}
# Example of using LLM to extract content
# data = {
# "urls": [
# "https://www.nbcnews.com/business"
# ],
# "extraction_strategy": "LLMExtractionStrategy",
# "extraction_strategy_args": {
# "provider": "groq/llama3-8b-8192",
# "api_token": os.environ.get("GROQ_API_KEY"),
# "instruction": """I am interested in only financial news,
# and translate them in French."""
# },
# }
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
print(result['markdown'])
print(result['cleaned_html'])
print(result['media'])
print(result['extracted_content'])
with open("screenshot.png", "wb") as f:
f.write(base64.b64decode(result['screenshot']))
data = {
"urls": [
"https://www.nbcnews.com/business"
],
"extraction_strategy": "LLMExtractionStrategy",
"extraction_strategy_args": {
"provider": "groq/llama3-8b-8192",
"api_token": os.environ.get("GROQ_API_KEY"),
"instruction": """I am interested in only financial news,
and translate them in French."""
},
}
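As with the earlier payloads in this script, the LLM request above would then be posted to the hosted endpoint and the extracted content read back; a short sketch:

```python
# Post the LLM-extraction request and read the result, mirroring the round trip
# used for the earlier payloads in this script.
response = requests.post("https://crawl4ai.com/crawl", json=data)
result = response.json()['results'][0]
print(result['extracted_content'])
```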

View File

@ -57,6 +57,7 @@ class CrawlRequest(BaseModel):
chunking_strategy_args: Optional[dict] = {}
css_selector: Optional[str] = None
screenshot: Optional[bool] = False
user_agent: Optional[str] = None
verbose: Optional[bool] = True
@ -127,6 +128,7 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
crawl_request.bypass_cache,
crawl_request.css_selector,
crawl_request.screenshot,
crawl_request.user_agent,
crawl_request.verbose
)
for url in crawl_request.urls
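Client-side, the new field is just another key in the JSON body. A minimal sketch against a locally running server, using the endpoint from the README's REST API section:

```python
import requests

data = {
    "urls": ["https://www.nbcnews.com/business"],
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36",
    "screenshot": False,
}

response = requests.post("http://localhost:8000/crawl", json=data)
print(response.json()["results"][0]["metadata"])
```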