| 
									
										
										
										
											2022-06-07 18:14:43 +02:00
										 |  |  | from typing import List | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | import json | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							| 
									
										
										
										
											2022-07-11 07:16:32 -03:00
										 |  |  | import re | 
					
						
							|  |  |  | import hashlib | 
					
						
							|  |  |  | import os | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2022-06-24 12:05:32 +02:00
										 |  |  | from selenium.webdriver.common.by import By | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-07-01 05:47:33 -03:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-26 18:12:55 +01:00
										 |  |  | from haystack.nodes.connector import Crawler | 
					
						
							| 
									
										
										
										
											2022-03-29 13:53:35 +02:00
										 |  |  | from haystack.schema import Document | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | from ..conftest import SAMPLES_PATH | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | @pytest.fixture(scope="session") | 
					
						
							|  |  |  | def test_url(): | 
					
						
							|  |  |  |     return f"file://{SAMPLES_PATH.absolute()}/crawler" | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | def content_match(crawler: Crawler, url: str, crawled_page: Path): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     :param crawler: the tested Crawler object | 
					
						
							| 
									
										
										
										
											2022-06-07 18:14:43 +02:00
										 |  |  |     :param url: the URL of the expected page | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  |     :param crawled_page: the output of Crawler (one element of the paths list) | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     crawler.driver.get(url) | 
					
						
							| 
									
										
										
										
											2022-07-01 05:47:33 -03:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-24 12:05:32 +02:00
										 |  |  |     body = crawler.driver.find_element(by=By.TAG_NAME, value="body") | 
					
						
							| 
									
										
										
										
											2022-06-10 09:51:41 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if crawler.extract_hidden_text: | 
					
						
							|  |  |  |         expected_crawled_content = body.get_attribute("textContent") | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         expected_crawled_content = body.text | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  |     with open(crawled_page, "r") as crawled_file: | 
					
						
							|  |  |  |         page_data = json.load(crawled_file) | 
					
						
							|  |  |  |         return page_data["content"] == expected_crawled_content | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-07 18:14:43 +02:00
										 |  |  | def content_in_results(crawler: Crawler, url: str, results: List[Path], expected_matches_count=1): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Makes sure there is exactly one matching page in the list of pages returned | 
					
						
							|  |  |  |     by the crawler. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     :param crawler: the tested Crawler object | 
					
						
							|  |  |  |     :param url: the URL of the page to find in the results | 
					
						
							|  |  |  |     :param results: the crawler's output (list of paths) | 
					
						
							|  |  |  |     :param expected_matches_count: how many copies of this page should be present in the results (default 1) | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     return sum(content_match(crawler, url, path) for path in results) == expected_matches_count | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | # | 
					
						
							|  |  |  | # Integration | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | @pytest.mark.integration | 
					
						
							|  |  |  | def test_crawler(tmp_path): | 
					
						
							|  |  |  |     tmp_dir = tmp_path | 
					
						
							|  |  |  |     url = ["https://haystack.deepset.ai/"] | 
					
						
							| 
									
										
										
										
											2021-08-18 17:05:44 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_dir) | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  |     docs_path = crawler.crawl(urls=url, crawler_depth=0) | 
					
						
							|  |  |  |     results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True) | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |     documents = results["documents"] | 
					
						
							| 
									
										
										
										
											2021-08-24 17:25:22 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for json_file, document in zip(docs_path, documents): | 
					
						
							|  |  |  |         assert isinstance(json_file, Path) | 
					
						
							| 
									
										
										
										
											2022-03-29 13:53:35 +02:00
										 |  |  |         assert isinstance(document, Document) | 
					
						
							| 
									
										
										
										
											2021-08-24 17:25:22 +05:00
										 |  |  | 
 | 
					
						
							|  |  |  |         with open(json_file.absolute(), "r") as read_file: | 
					
						
							|  |  |  |             file_content = json.load(read_file) | 
					
						
							| 
									
										
										
										
											2022-03-29 13:53:35 +02:00
										 |  |  |             assert file_content["meta"] == document.meta | 
					
						
							|  |  |  |             assert file_content["content"] == document.content | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Unit tests | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_url_none_exception(tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(tmp_path) | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         crawler.crawl() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_depth_0_single_url(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0) | 
					
						
							|  |  |  |     assert len(paths) == 1 | 
					
						
							|  |  |  |     assert content_match(crawler, test_url + "/index.html", paths[0]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_depth_0_many_urls(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  |     _urls = [test_url + "/index.html", test_url + "/page1.html"] | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=_urls, crawler_depth=0) | 
					
						
							|  |  |  |     assert len(paths) == 2 | 
					
						
							| 
									
										
										
										
											2022-06-07 18:14:43 +02:00
										 |  |  |     assert content_in_results(crawler, test_url + "/index.html", paths) | 
					
						
							|  |  |  |     assert content_in_results(crawler, test_url + "/page1.html", paths) | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_depth_1_single_url(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1) | 
					
						
							|  |  |  |     assert len(paths) == 3 | 
					
						
							| 
									
										
										
										
											2022-06-07 18:14:43 +02:00
										 |  |  |     assert content_in_results(crawler, test_url + "/index.html", paths) | 
					
						
							|  |  |  |     assert content_in_results(crawler, test_url + "/page1.html", paths) | 
					
						
							|  |  |  |     assert content_in_results(crawler, test_url + "/page2.html", paths) | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_output_file_structure(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0) | 
					
						
							|  |  |  |     assert content_match(crawler, test_url + "/index.html", paths[0]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with open(paths[0].absolute(), "r") as doc_file: | 
					
						
							|  |  |  |         data = json.load(doc_file) | 
					
						
							|  |  |  |         assert "content" in data | 
					
						
							|  |  |  |         assert "meta" in data | 
					
						
							|  |  |  |         assert isinstance(data["content"], str) | 
					
						
							|  |  |  |         assert len(data["content"].split()) > 2 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_filter_urls(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1) | 
					
						
							|  |  |  |     assert len(paths) == 1 | 
					
						
							|  |  |  |     assert content_match(crawler, test_url + "/index.html", paths[0]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Note: filter_urls can exclude pages listed in `urls` as well | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1) | 
					
						
							|  |  |  |     assert len(paths) == 1 | 
					
						
							|  |  |  |     assert content_match(crawler, test_url + "/page1.html", paths[0]) | 
					
						
							| 
									
										
										
										
											2022-07-01 05:47:33 -03:00
										 |  |  |     assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1) | 
					
						
							| 
									
										
										
										
											2022-06-06 17:52:37 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_return_document(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  |     documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True) | 
					
						
							|  |  |  |     paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for path, document in zip(paths["paths"], documents["documents"]): | 
					
						
							|  |  |  |         with open(path.absolute(), "r") as doc_file: | 
					
						
							|  |  |  |             file_content = json.load(doc_file) | 
					
						
							|  |  |  |             assert file_content["meta"] == document.meta | 
					
						
							|  |  |  |             assert file_content["content"] == document.content | 
					
						
							| 
									
										
										
										
											2022-06-10 09:51:41 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_extract_hidden_text(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  |     documents, _ = crawler.run( | 
					
						
							|  |  |  |         urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     crawled_content = documents["documents"][0].content | 
					
						
							|  |  |  |     assert "hidden text" in crawled_content | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     documents, _ = crawler.run( | 
					
						
							|  |  |  |         urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     crawled_content = documents["documents"][0].content | 
					
						
							|  |  |  |     assert "hidden text" not in crawled_content | 
					
						
							| 
									
										
										
										
											2022-07-01 05:47:33 -03:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_loading_wait_time(test_url, tmp_path): | 
					
						
							|  |  |  |     loading_wait_time = 3 | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert len(paths) == 4 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with open(f"{SAMPLES_PATH.absolute()}/crawler/page_dynamic_result.txt", "r") as dynamic_result: | 
					
						
							|  |  |  |         dynamic_result_text = dynamic_result.read() | 
					
						
							|  |  |  |         for path in paths: | 
					
						
							|  |  |  |             with open(path, "r") as crawled_file: | 
					
						
							|  |  |  |                 page_data = json.load(crawled_file) | 
					
						
							|  |  |  |                 if page_data["meta"]["url"] == test_url + "/page_dynamic.html": | 
					
						
							|  |  |  |                     assert dynamic_result_text == page_data["content"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert content_in_results(crawler, test_url + "/index.html", paths) | 
					
						
							|  |  |  |     assert content_in_results(crawler, test_url + "/page1.html", paths) | 
					
						
							|  |  |  |     assert content_in_results(crawler, test_url + "/page2.html", paths) | 
					
						
							| 
									
										
										
										
											2022-07-11 07:16:32 -03:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_default_naming_function(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler(output_dir=tmp_path) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html" | 
					
						
							|  |  |  |     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129]) | 
					
						
							|  |  |  |     file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest() | 
					
						
							|  |  |  |     expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[link], crawler_depth=0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert os.path.exists(paths[0]) | 
					
						
							|  |  |  |     assert paths[0] == Path(expected_crawled_file_path) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_crawler_naming_function(test_url, tmp_path): | 
					
						
							|  |  |  |     crawler = Crawler( | 
					
						
							|  |  |  |         output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link) | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     link = f"{test_url}/page_dynamic.html" | 
					
						
							|  |  |  |     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link) | 
					
						
							|  |  |  |     expected_crawled_file_path = tmp_path / f"{file_name_link}.json" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert os.path.exists(paths[0]) | 
					
						
							|  |  |  |     assert paths[0] == expected_crawled_file_path |