mirror of
https://github.com/unclecode/crawl4ai.git
synced 2025-12-28 10:58:29 +00:00
Update LLMExtractionStrategy to disable chunking if specified, Add example of summarization for a web page.
This commit is contained in:
parent
1fcb573909
commit
21b110bfd7
@ -77,6 +77,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
|
||||
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
||||
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
||||
self.apply_chunking = kwargs.get("apply_chunking", True)
|
||||
if not self.apply_chunking:
|
||||
self.chunk_token_threshold = 1e9
|
||||
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
|
||||
46
docs/examples/summarize_page.py
Normal file
46
docs/examples/summarize_page.py
Normal file
@ -0,0 +1,46 @@
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
|
||||
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class PageSummary(BaseModel):
|
||||
title: str = Field(..., description="Title of the page.")
|
||||
summary: str = Field(..., description="Summary of the page.")
|
||||
brief_summary: str = Field(..., description="Brief summary of the page.")
|
||||
keywords: list = Field(..., description="Keywords assigned to the page.")
|
||||
|
||||
result = crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy= LLMExtractionStrategy(
|
||||
provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||
schema=PageSummary.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
apply_chunking =False,
|
||||
instruction="From the crawled content, extract the following details: "\
|
||||
"1. Title of the page "\
|
||||
"2. Summary of the page, which is a detailed summary "\
|
||||
"3. Brief summary of the page, which is a paragraph text "\
|
||||
"4. Keywords assigned to the page, which is a list of keywords. "\
|
||||
'The extracted JSON format should look like this: '\
|
||||
'{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
page_summary = json.loads(result.extracted_content)
|
||||
|
||||
print(page_summary)
|
||||
|
||||
with open(".data/page_summary.json", "w") as f:
|
||||
f.write(result.extracted_content)
|
||||
Loading…
x
Reference in New Issue
Block a user