Mirror of https://github.com/deepset-ai/haystack.git

Make crawler extract also hidden text (#2642)

* make crawler extract also hidden text
* Update Documentation & Code Style
* try to adapt test for extract_hidden_text
* Update Documentation & Code Style
* fix test bug
* fix bug in test
* added test for hidden text
* Update Documentation & Code Style
* fix bug in test
* Update Documentation & Code Style
* fix test
* Update Documentation & Code Style
* fix other test bug

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

Parent: c8f9e1b76c
Commit: c178f60e3a
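In short, `Crawler` gains an `extract_hidden_text` flag (default `True`): when enabled, the crawler reads the full `textContent` of the page body instead of only the visible text. A minimal usage sketch, assuming a Haystack v1.x install with the Selenium-based crawler available; the output directory and URL below are placeholders:

```python
from haystack.nodes import Crawler

# With extract_hidden_text=True (the new default) the crawler also captures text
# that is hidden via CSS, e.g. elements styled with display: none.
crawler = Crawler(output_dir="crawled_files", extract_hidden_text=True)
file_paths = crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"], crawler_depth=0)
```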
Crawler API documentation (Markdown):

@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
 #### Crawler.\_\_init\_\_
 
 ```python
-def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None)
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True)
 ```
 
 Init object with basic params for crawling (can be overwritten later).
@@ -46,13 +46,15 @@ All URLs not matching at least one of the regular expressions will be dropped.
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
 
 <a id="crawler.Crawler.crawl"></a>
 
 #### Crawler.crawl
 
 ```python
-def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Path]
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None) -> List[Path]
 ```
 
 Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -86,7 +88,7 @@ List of paths where the crawled webpages got stored
 #### Crawler.run
 
 ```python
-def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None) -> Tuple[Dict, str]
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True) -> Tuple[Dict, str]
 ```
 
 Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -106,6 +108,8 @@ All URLs not matching at least one of the regular expressions will be dropped.
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
 
 **Returns**:
 
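As documented above, `run()` is the entry point used when the Crawler acts as a node in a Haystack pipeline, and it returns a tuple of an output dict and the name of the output edge. A short sketch of calling it directly, based on the signature shown above; the URL and directory are placeholders:

```python
from haystack.nodes import Crawler

crawler = Crawler(output_dir="crawled_files")
output, edge = crawler.run(
    urls=["https://haystack.deepset.ai/overview/intro"],
    crawler_depth=0,
    return_documents=True,     # return Document objects instead of file paths
    extract_hidden_text=True,  # the parameter added by this commit
)
documents = output["documents"]  # with return_documents=False, output["paths"] lists the written JSON files
```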
JSON schema (Crawler component parameters):

@@ -1879,6 +1879,10 @@
         "items": {
           "type": "string"
         }
+      },
+      "extract_hidden_text": {
+        "title": "Extract Hidden Text",
+        "default": true
       }
     },
     "required": [
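This fragment registers the new parameter in the schema with a default of `true`, matching the constructor default added below. A quick, hedged way to check the constructor side (assuming Haystack v1.x is installed):

```python
import inspect

from haystack.nodes import Crawler

# The schema default above mirrors the default on Crawler.__init__ introduced by this commit.
print(inspect.signature(Crawler.__init__).parameters["extract_hidden_text"].default)  # True
```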
Crawler implementation (`class Crawler(BaseComponent)`):

@@ -47,6 +47,7 @@ class Crawler(BaseComponent):
         filter_urls: Optional[List] = None,
         overwrite_existing_files=True,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text=True,
     ):
         """
         Init object with basic params for crawling (can be overwritten later).
@@ -63,6 +64,8 @@ class Crawler(BaseComponent):
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param extract_hidden_text: Whether to extract the hidden text contained in page.
+            E.g. the text can be inside a span with style="display: none"
         """
         super().__init__()
 
@@ -93,6 +96,7 @@ class Crawler(BaseComponent):
         self.filter_urls = filter_urls
         self.overwrite_existing_files = overwrite_existing_files
         self.id_hash_keys = id_hash_keys
+        self.extract_hidden_text = extract_hidden_text
 
     def crawl(
         self,
@@ -102,6 +106,7 @@ class Crawler(BaseComponent):
         filter_urls: Optional[List] = None,
         overwrite_existing_files: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = None,
     ) -> List[Path]:
         """
         Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -138,6 +143,8 @@ class Crawler(BaseComponent):
             overwrite_existing_files = self.overwrite_existing_files
         if crawler_depth is None:
             crawler_depth = self.crawler_depth
+        if extract_hidden_text is None:
+            extract_hidden_text = self.extract_hidden_text
 
         output_dir = Path(output_dir)
         if not output_dir.exists():
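The hunk above makes `crawl()` treat `None` as "not set" and fall back to the value stored on the instance, so the constructor default can be overridden per call. A small illustration of that behaviour; the URL and directory are placeholders:

```python
from haystack.nodes import Crawler

crawler = Crawler(output_dir="crawled_files", extract_hidden_text=False)

# No argument passed: the instance default (False) applies and hidden text is skipped.
crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"], crawler_depth=0)

# Explicit argument: overrides the instance default for this call only.
crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"], crawler_depth=0, extract_hidden_text=True)
```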
@@ -155,9 +162,11 @@ class Crawler(BaseComponent):
             pattern = re.compile("|".join(filter_urls))
             for url in urls:
                 if pattern.search(url):
-                    file_paths += self._write_to_files([url], output_dir=output_dir)
+                    file_paths += self._write_to_files(
+                        [url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
+                    )
         else:
-            file_paths += self._write_to_files(urls, output_dir=output_dir)
+            file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
         # follow one level of sublinks if requested
         if crawler_depth == 1:
             sub_links: Dict[str, List] = {}
@@ -170,19 +179,31 @@ class Crawler(BaseComponent):
                 )
                 for url, extracted_sublink in sub_links.items():
                     file_paths += self._write_to_files(
-                        extracted_sublink, output_dir=output_dir, base_url=url, id_hash_keys=id_hash_keys
+                        extracted_sublink,
+                        output_dir=output_dir,
+                        base_url=url,
+                        id_hash_keys=id_hash_keys,
+                        extract_hidden_text=extract_hidden_text,
                     )
 
         return file_paths
 
     def _write_to_files(
-        self, urls: List[str], output_dir: Path, base_url: str = None, id_hash_keys: Optional[List[str]] = None
+        self,
+        urls: List[str],
+        output_dir: Path,
+        extract_hidden_text: bool,
+        base_url: str = None,
+        id_hash_keys: Optional[List[str]] = None,
     ) -> List[Path]:
         paths = []
         for link in urls:
             logger.info(f"writing contents from `{link}`")
             self.driver.get(link)
             el = self.driver.find_element_by_tag_name("body")
-            text = el.text
+            if extract_hidden_text:
+                text = el.get_attribute("textContent")
+            else:
+                text = el.text
 
             link_split_values = link.replace("https://", "").split("/")
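The behavioural core of the change is in `_write_to_files`: Selenium's `WebElement.text` returns only rendered (visible) text, while reading the `textContent` attribute returns the full DOM text, including elements hidden with `display: none`. A standalone sketch of the difference, assuming a Chrome driver set up roughly like the crawler's own and a local copy of the sample page added by this commit (the file path is a placeholder):

```python
from selenium import webdriver

driver = webdriver.Chrome()  # assumes a chromedriver is available on PATH
driver.get("file:///tmp/page_w_hidden_text.html")  # placeholder path to the sample page
body = driver.find_element_by_tag_name("body")     # same Selenium 3 style call as the code above

print(body.text)                          # visible text only
print(body.get_attribute("textContent"))  # also includes the hidden text
driver.quit()
```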
@@ -210,6 +231,7 @@ class Crawler(BaseComponent):
         overwrite_existing_files: Optional[bool] = None,
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = True,
     ) -> Tuple[Dict, str]:
         """
         Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -227,6 +249,8 @@ class Crawler(BaseComponent):
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param extract_hidden_text: Whether to extract the hidden text contained in page.
+            E.g. the text can be inside a span with style="display: none"
 
         :return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
         """
@@ -237,6 +261,7 @@ class Crawler(BaseComponent):
            crawler_depth=crawler_depth,
            filter_urls=filter_urls,
            overwrite_existing_files=overwrite_existing_files,
+           extract_hidden_text=extract_hidden_text,
         )
         if return_documents:
             crawled_data = []
@@ -258,6 +283,7 @@ class Crawler(BaseComponent):
         overwrite_existing_files: Optional[bool] = None,
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = True,
     ):
         return self.run(
             output_dir=output_dir,
@@ -267,6 +293,7 @@ class Crawler(BaseComponent):
             overwrite_existing_files=overwrite_existing_files,
             return_documents=return_documents,
             id_hash_keys=id_hash_keys,
+            extract_hidden_text=extract_hidden_text,
         )
 
     @staticmethod
Tests:

@@ -24,6 +24,10 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
     """
     crawler.driver.get(url)
     body = crawler.driver.find_element_by_tag_name("body")
-    expected_crawled_content = body.text
+
+    if crawler.extract_hidden_text:
+        expected_crawled_content = body.get_attribute("textContent")
+    else:
+        expected_crawled_content = body.text
 
     with open(crawled_page, "r") as crawled_file:
@@ -142,3 +146,18 @@ def test_crawler_return_document(test_url, tmp_path):
         file_content = json.load(doc_file)
         assert file_content["meta"] == document.meta
         assert file_content["content"] == document.content
+
+
+def test_crawler_extract_hidden_text(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    documents, _ = crawler.run(
+        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
+    )
+    crawled_content = documents["documents"][0].content
+    assert "hidden text" in crawled_content
+
+    documents, _ = crawler.run(
+        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
+    )
+    crawled_content = documents["documents"][0].content
+    assert "hidden text" not in crawled_content
New file: test/samples/crawler/page_w_hidden_text.html (10 lines)

@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page</title>
+</head>
+<body>
+    <p>visible text</p>
+    <p style="display: none">hidden text</p>
+</body>
+</html>