diff --git a/docs/_src/api/api/crawler.md b/docs/_src/api/api/crawler.md index 05e518aa3..65660cfb1 100644 --- a/docs/_src/api/api/crawler.md +++ b/docs/_src/api/api/crawler.md @@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus #### Crawler.\_\_init\_\_ ```python -def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None) +def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True) ``` Init object with basic params for crawling (can be overwritten later). @@ -46,13 +46,15 @@ All URLs not matching at least one of the regular expressions will be dropped. attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. +- `extract_hidden_text`: Whether to extract the hidden text contained in page. +E.g. the text can be inside a span with style="display: none" #### Crawler.crawl ```python -def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Path] +def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None) -> List[Path] ``` Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON @@ -86,7 +88,7 @@ List of paths where the crawled webpages got stored #### Crawler.run ```python -def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None) -> Tuple[Dict, str] +def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True) -> Tuple[Dict, str] ``` Method to be executed when the Crawler is used as a Node within a Haystack pipeline. @@ -106,6 +108,8 @@ All URLs not matching at least one of the regular expressions will be dropped. attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. +- `extract_hidden_text`: Whether to extract the hidden text contained in page. +E.g. 
the text can be inside a span with style="display: none" **Returns**: diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index 8ea55af33..4cacfe7d2 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -1879,6 +1879,10 @@ "items": { "type": "string" } + }, + "extract_hidden_text": { + "title": "Extract Hidden Text", + "default": true } }, "required": [ diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index f57eee7f4..0c10d98df 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -47,6 +47,7 @@ class Crawler(BaseComponent): filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, + extract_hidden_text=True, ): """ Init object with basic params for crawling (can be overwritten later). @@ -63,6 +64,8 @@ class Crawler(BaseComponent): attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. + :param extract_hidden_text: Whether to extract the hidden text contained in page. + E.g. the text can be inside a span with style="display: none" """ super().__init__() @@ -93,6 +96,7 @@ class Crawler(BaseComponent): self.filter_urls = filter_urls self.overwrite_existing_files = overwrite_existing_files self.id_hash_keys = id_hash_keys + self.extract_hidden_text = extract_hidden_text def crawl( self, @@ -102,6 +106,7 @@ class Crawler(BaseComponent): filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, + extract_hidden_text: Optional[bool] = None, ) -> List[Path]: """ Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON @@ -138,6 +143,8 @@ class Crawler(BaseComponent): overwrite_existing_files = self.overwrite_existing_files if crawler_depth is None: crawler_depth = self.crawler_depth + if extract_hidden_text is None: + extract_hidden_text = self.extract_hidden_text output_dir = Path(output_dir) if not output_dir.exists(): @@ -155,9 +162,11 @@ class Crawler(BaseComponent): pattern = re.compile("|".join(filter_urls)) for url in urls: if pattern.search(url): - file_paths += self._write_to_files([url], output_dir=output_dir) + file_paths += self._write_to_files( + [url], output_dir=output_dir, extract_hidden_text=extract_hidden_text + ) else: - file_paths += self._write_to_files(urls, output_dir=output_dir) + file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text) # follow one level of sublinks if requested if crawler_depth == 1: sub_links: Dict[str, List] = {} @@ -170,20 +179,32 @@ class Crawler(BaseComponent): ) for url, extracted_sublink in sub_links.items(): file_paths += self._write_to_files( - extracted_sublink, output_dir=output_dir, base_url=url, id_hash_keys=id_hash_keys + extracted_sublink, + output_dir=output_dir, + base_url=url, + id_hash_keys=id_hash_keys, + extract_hidden_text=extract_hidden_text, ) return file_paths def _write_to_files( - self, urls: List[str], output_dir: Path, base_url: str = None, id_hash_keys: Optional[List[str]] = None + self, + urls: List[str], + output_dir: Path, 
+ extract_hidden_text: bool, + base_url: str = None, + id_hash_keys: Optional[List[str]] = None, ) -> List[Path]: paths = [] for link in urls: logger.info(f"writing contents from `{link}`") self.driver.get(link) el = self.driver.find_element_by_tag_name("body") - text = el.text + if extract_hidden_text: + text = el.get_attribute("textContent") + else: + text = el.text link_split_values = link.replace("https://", "").split("/") file_name = f"{'_'.join(link_split_values)}.json" @@ -210,6 +231,7 @@ class Crawler(BaseComponent): overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, + extract_hidden_text: Optional[bool] = True, ) -> Tuple[Dict, str]: """ Method to be executed when the Crawler is used as a Node within a Haystack pipeline. @@ -227,6 +249,8 @@ class Crawler(BaseComponent): attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. + :param extract_hidden_text: Whether to extract the hidden text contained in page. + E.g. the text can be inside a span with style="display: none" :return: Tuple({"paths": List of filepaths, ...}, Name of output edge) """ @@ -237,6 +261,7 @@ class Crawler(BaseComponent): crawler_depth=crawler_depth, filter_urls=filter_urls, overwrite_existing_files=overwrite_existing_files, + extract_hidden_text=extract_hidden_text, ) if return_documents: crawled_data = [] @@ -258,6 +283,7 @@ class Crawler(BaseComponent): overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, + extract_hidden_text: Optional[bool] = True, ): return self.run( output_dir=output_dir, @@ -267,6 +293,7 @@ class Crawler(BaseComponent): overwrite_existing_files=overwrite_existing_files, return_documents=return_documents, id_hash_keys=id_hash_keys, + extract_hidden_text=extract_hidden_text, ) @staticmethod diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py index 66b267203..382c1e9be 100644 --- a/test/nodes/test_connector.py +++ b/test/nodes/test_connector.py @@ -24,7 +24,11 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path): """ crawler.driver.get(url) body = crawler.driver.find_element_by_tag_name("body") - expected_crawled_content = body.text + + if crawler.extract_hidden_text: + expected_crawled_content = body.get_attribute("textContent") + else: + expected_crawled_content = body.text with open(crawled_page, "r") as crawled_file: page_data = json.load(crawled_file) @@ -142,3 +146,18 @@ def test_crawler_return_document(test_url, tmp_path): file_content = json.load(doc_file) assert file_content["meta"] == document.meta assert file_content["content"] == document.content + + +def test_crawler_extract_hidden_text(test_url, tmp_path): + crawler = Crawler(output_dir=tmp_path) + documents, _ = crawler.run( + urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True + ) + crawled_content = documents["documents"][0].content + assert "hidden text" in crawled_content + + documents, _ = crawler.run( + urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True + ) + crawled_content = documents["documents"][0].content + assert "hidden text" not in crawled_content diff --git 
a/test/samples/crawler/page_w_hidden_text.html b/test/samples/crawler/page_w_hidden_text.html
new file mode 100644
index 000000000..f1455ed4d
--- /dev/null
+++ b/test/samples/crawler/page_w_hidden_text.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page</title>
+</head>
+<body>
+    <div>visible text</div>
+    <div style="display: none">hidden text</div>
+</body>
+</html>
\ No newline at end of file
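For context, a minimal usage sketch of the new flag, modelled on the test added in this diff. The output directory and URL are placeholders, not part of the patch; the import path follows `haystack/nodes/connector/crawler.py`.

```python
from haystack.nodes.connector.crawler import Crawler

# Hypothetical example: crawl a single page and keep CSS-hidden text.
# With extract_hidden_text=True the crawler reads the body's textContent,
# so text inside e.g. <span style="display: none"> ends up in the Document;
# with extract_hidden_text=False only Selenium's visible .text is kept.
crawler = Crawler(output_dir="crawled_files")  # placeholder output directory
results, _ = crawler.run(
    urls=["https://example.com/page_w_hidden_text.html"],  # placeholder URL
    crawler_depth=0,
    extract_hidden_text=True,
    return_documents=True,
)
print(results["documents"][0].content)
```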
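A small sketch of the Selenium behaviour the change relies on, separate from the patch itself (which only swaps `element.text` for `element.get_attribute("textContent")` in `_write_to_files`). The headless Chrome setup and local file path are assumptions for illustration; the Selenium 3-style `find_element_by_tag_name` call matches the one used in `crawler.py`.

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Assumes a local chromedriver is available.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

driver.get("file:///tmp/page_w_hidden_text.html")  # hypothetical local copy of the test page
body = driver.find_element_by_tag_name("body")

print(body.text)                           # rendered, visible text only
print(body.get_attribute("textContent"))   # also includes text hidden with display: none

driver.quit()
```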