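"""Integration tests for crawl4ai's WebCrawler.

Covers crawler warmup, combinations of chunking strategies (regex, fixed-length
word, sliding window) with extraction strategies (cosine, LLM, topic, none),
cache vs. bypass-cache behavior, multi-page fetching, and error handling for
invalid URLs, invalid CSS selectors, and unsupported extraction strategies.

Note: these tests fetch live pages (nbcnews.com, bbc.com), and the LLM cases
call the OpenAI API, so network access is required.
"""
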
import os
import unittest

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import (
    RegexChunking,
    FixedLengthWordChunking,
    SlidingWindowChunking,
)
from crawl4ai.extraction_strategy import (
    CosineStrategy,
    LLMExtractionStrategy,
    TopicExtractionStrategy,
    NoExtractionStrategy,
)


class TestWebCrawler(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler()

    def test_warmup(self):
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")

    def test_run_default_strategies(self):
        result = self.crawler.run(
            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
            extraction_strategy=CosineStrategy(),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success, "Failed to crawl and extract using default strategies"
        )
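
    # Requires OPENAI_API_KEY in the environment for the LLMExtractionStrategy case.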
    def test_run_different_strategies(self):
        url = "https://www.nbcnews.com/business"

        # Test with FixedLengthWordChunking and LLMExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
            ),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy",
        )

        # Test with SlidingWindowChunking and TopicExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=TopicExtractionStrategy(num_keywords=5),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy",
        )

    def test_invalid_url(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url="invalid_url", bypass_cache=True)
        self.assertIn("Invalid URL", str(context.exception))

    def test_unsupported_extraction_strategy(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(
                url="https://www.nbcnews.com/business",
                extraction_strategy="UnsupportedStrategy",
                bypass_cache=True,
            )
        self.assertIn("Unsupported extraction strategy", str(context.exception))

    def test_invalid_css_selector(self):
        with self.assertRaises(ValueError) as context:
            self.crawler.run(
                url="https://www.nbcnews.com/business",
                css_selector="invalid_selector",
                bypass_cache=True,
            )
        self.assertIn("Invalid CSS selector", str(context.exception))

    def test_crawl_with_cache_and_bypass_cache(self):
        url = "https://www.nbcnews.com/business"

        # First crawl with cache enabled
        result = self.crawler.run(url=url, bypass_cache=False)
        self.assertTrue(result.success, "Failed to crawl and cache the result")

        # Second crawl with bypass_cache=True
        result = self.crawler.run(url=url, bypass_cache=True)
        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")

    def test_fetch_multiple_pages(self):
        urls = ["https://www.nbcnews.com/business", "https://www.bbc.com/news"]
        results = []
        for url in urls:
            result = self.crawler.run(
                url=url,
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
                bypass_cache=True,
            )
            results.append(result)

        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
        for result in results:
            self.assertTrue(
                result.success, "Failed to crawl and extract a page in the list"
            )

    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        result = self.crawler.run(
            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy",
        )

    def test_run_sliding_window_and_no_extraction(self):
        result = self.crawler.run(
            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy",
        )
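

# The suite can be run directly (python <path to this file>) via unittest.main()
# below, or picked up by the standard runner with python -m unittest discover.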
if __name__ == "__main__":
    unittest.main()