Update for v0.2.2

- Support multiple JS scripts
- Fixed some bugs
- Resolved a few issues related to Colab installation
commit 51f26d12fe (parent f1b60b2016)
@@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
 
-## Recent Changes v0.2.0
+## Recent Changes
+
+### v0.2.2
+- Support multiple JS scripts
+- Fixed some bugs
+- Resolved a few issues related to Colab installation
+
+### v0.2.0
 - 🚀 10x faster!!
 - 📜 Execute custom JavaScript before crawling!
 - 🤝 Colab friendly!
@@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         )
 
         # Execute JS code if provided
-        if self.js_code:
+        if self.js_code and type(self.js_code) == str:
             self.driver.execute_script(self.js_code)
             # Optionally, wait for some condition after executing the JS code
             WebDriverWait(self.driver, 10).until(
                 lambda driver: driver.execute_script("return document.readyState") == "complete"
             )
+        elif self.js_code and type(self.js_code) == list:
+            for js in self.js_code:
+                self.driver.execute_script(js)
+                WebDriverWait(self.driver, 10).until(
+                    lambda driver: driver.execute_script("return document.readyState") == "complete"
+                )
 
         html = self.driver.page_source
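With this change, `js_code` accepts either a single script (str) or a list of scripts. A minimal usage sketch of both forms (the import paths and the example scripts are illustrative assumptions, not taken from the repository):

```python
# Hedged sketch of the new str-or-list js_code handling; import paths are assumed.
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

# A single script can still be passed as a plain string.
scroll_only = LocalSeleniumCrawlerStrategy(
    js_code="window.scrollTo(0, document.body.scrollHeight);"
)

# A list of scripts now runs in order; after each script the strategy waits
# until document.readyState == "complete" before continuing.
scroll_then_click = LocalSeleniumCrawlerStrategy(js_code=[
    "window.scrollTo(0, document.body.scrollHeight);",
    "var b = document.querySelector('button'); if (b) { b.click(); }",
])

crawler = WebCrawler(crawler_strategy=scroll_then_click, always_by_pass_cache=True)
result = crawler.run(url="https://www.nbcnews.com/business")
```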
@@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy):
         if self.verbose:
             print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
 
-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()
+        self.get_embedding_method = "batch"
 
         self.buffer_embeddings = np.array([])
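With the ONNX branch commented out, every device now loads the BGE-small-en-v1.5 tokenizer/model pair in eval mode and uses the "batch" embedding method. As rough orientation, batch embedding with an HF-style pair of that kind typically looks like the sketch below (the pooling choice and exact calls are assumptions, not the repository's implementation):

```python
import numpy as np
import torch

def embed_batch(texts, tokenizer, model, device, batch_size=16):
    """Hedged sketch: embed a list of text snippets in fixed-size batches."""
    chunks = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model(**inputs)
        # BGE models are commonly pooled via the [CLS] token embedding.
        chunks.append(out.last_hidden_state[:, 0].cpu().numpy())
    return np.vstack(chunks)
```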
@@ -35,8 +35,7 @@ def calculate_batch_size(device):
         else:
             return 32
     else:
         return 16  # Default batch size
 
-    return 16  # Default batch size
 
 @lru_cache()
 def get_device():
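Only the tail of `calculate_batch_size` is visible in this hunk; the commit simply drops an unreachable duplicate of the `return 16` fallback. For orientation, a device-based batch-size heuristic of this shape might look like the following sketch (the GPU branch and memory thresholds are illustrative assumptions):

```python
import torch

def calculate_batch_size(device):
    # Illustrative sketch only; the thresholds here are assumptions.
    if device.type == "cuda":
        total_gb = torch.cuda.get_device_properties(device).total_memory / 1e9
        if total_gb >= 16:
            return 64
        else:
            return 32
    else:
        return 16  # Default batch size
```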
@@ -258,8 +257,8 @@ def download_all_models(remove_existing=False):
     # load_bert_base_uncased()
     # print("[LOG] Downloading BGE Small EN v1.5...")
     # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
     _, device = load_text_multilabel_classifier()
     print(f"[LOG] Text classifier loaded on {device}")
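A hedged usage note: `download_all_models` can be run once up front (for example in a Colab setup cell) so the first crawl does not pay the download cost. The import path below is an assumption:

```python
# Assumed import path; the function name comes from the diff above.
from crawl4ai.model_loader import download_all_models

download_all_models()
# After this commit the log reports the text classifier rather than the ONNX model:
#   [LOG] Downloading text classifier...
#   [LOG] Text classifier loaded on cpu   (or cuda, depending on get_device())
```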
@@ -164,6 +164,22 @@ def interactive_extraction(crawler):
     cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
     print_result(result)
 
+def multiple_scrip(crawler):
+    # Passing JavaScript code to interact with the page
+    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    js_code = ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """] * 2
+    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+    )
+    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
 def main():
     cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
     cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -180,6 +196,7 @@ def main():
     add_llm_extraction_strategy(crawler)
     targeted_extraction(crawler)
     interactive_extraction(crawler)
+    multiple_scrip(crawler)
 
     cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
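Note that `multiple_scrip` ignores the `crawler` argument passed in from `main()`: it constructs its own `LocalSeleniumCrawlerStrategy` and a fresh `WebCrawler` with `always_by_pass_cache=True`, so both copies of the 'Load More' script execute against a newly fetched page rather than a cached result.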
requirements.crawl.txt (new file, 13 lines)
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
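This new file lists only the unpinned, crawl-only dependencies (no torch, transformers, or nltk), so it can be installed directly with `pip install -r requirements.crawl.txt` or consumed through the `crawl` extra defined in setup.py below.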
requirements.txt
@@ -1,20 +1,20 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
+aiohttp
+aiosqlite
+bs4
+fastapi
 html2text
-httpx==0.27.0
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
 torch
 onnxruntime
 tokenizers
setup.py (9 lines changed)
@@ -7,11 +7,16 @@ from setuptools.command.install import install
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()
 
+# Read the requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
 
 class CustomInstallCommand(install):
     """Customized setuptools install command to install spacy without dependencies."""
@@ -21,7 +26,7 @@ class CustomInstallCommand(install):
 
 setup(
     name="Crawl4AI",
-    version="0.2.1",
+    version="0.2.2",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
@@ -34,7 +39,7 @@ setup(
     extras_require={
         "all": requirements,  # Include all requirements
         "colab": requirements_without_torch,  # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only,  # Include only crawl requirements
     },
     cmdclass={
         'install': CustomInstallCommand,
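With these extras, the intended install paths are `pip install "Crawl4AI[all]"` for everything, `pip install "Crawl4AI[colab]"` to skip torch on Colab, and `pip install "Crawl4AI[crawl]"` for the lightweight crawl-only dependency set backed by `requirements_crawl_only`.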