Make crawler also extract hidden text (#2642)

* make crawler extract also hidden text

* Update Documentation & Code Style

* try to adapt test for extract_hidden_text

* Update Documentation & Code Style

* fix test bug

* fix bug in test

* added test for hidden text

* Update Documentation & Code Style

* fix bug in test

* Update Documentation & Code Style

* fix test

* Update Documentation & Code Style

* fix other test bug

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Stefano Fiorucci 2022-06-10 09:51:41 +02:00 committed by GitHub
parent c8f9e1b76c
commit c178f60e3a
5 changed files with 73 additions and 9 deletions


@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_
```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None)
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True)
```
Init object with basic params for crawling (can be overwritten later).
@@ -46,13 +46,15 @@ All URLs not matching at least one of the regular expressions will be dropped.
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `extract_hidden_text`: Whether to extract the hidden text contained in the page.
E.g., the text can be inside a span with style="display: none".
<a id="crawler.Crawler.crawl"></a>
#### Crawler.crawl
```python
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Path]
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None) -> List[Path]
```
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -86,7 +88,7 @@ List of paths where the crawled webpages got stored
#### Crawler.run
```python
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None) -> Tuple[Dict, str]
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True) -> Tuple[Dict, str]
```
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -106,6 +108,8 @@ All URLs not matching at least one of the regular expressions will be dropped.
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `extract_hidden_text`: Whether to extract the hidden text contained in the page.
E.g., the text can be inside a span with style="display: none".
**Returns**:

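For reference, a minimal usage sketch of the `__init__` and `crawl()` signatures documented above, assuming the usual `haystack.nodes` import path (the URL and output directory are placeholders):

```python
from haystack.nodes import Crawler

# Hidden text is extracted by default (extract_hidden_text=True).
crawler = Crawler(output_dir="crawled_files", extract_hidden_text=True)

# crawl() writes one JSON file per URL and returns the file paths.
file_paths = crawler.crawl(urls=["https://example.com"], crawler_depth=0)
print(file_paths)
```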

@@ -1879,6 +1879,10 @@
"items": {
"type": "string"
}
},
"extract_hidden_text": {
"title": "Extract Hidden Text",
"default": true
}
},
"required": [


@@ -47,6 +47,7 @@ class Crawler(BaseComponent):
filter_urls: Optional[List] = None,
overwrite_existing_files=True,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
):
"""
Init object with basic params for crawling (can be overwritten later).
@@ -63,6 +64,8 @@ class Crawler(BaseComponent):
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g., the text can be inside a span with style="display: none".
"""
super().__init__()
@@ -93,6 +96,7 @@ class Crawler(BaseComponent):
self.filter_urls = filter_urls
self.overwrite_existing_files = overwrite_existing_files
self.id_hash_keys = id_hash_keys
self.extract_hidden_text = extract_hidden_text
def crawl(
self,
@@ -102,6 +106,7 @@ class Crawler(BaseComponent):
filter_urls: Optional[List] = None,
overwrite_existing_files: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
) -> List[Path]:
"""
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -138,6 +143,8 @@ class Crawler(BaseComponent):
overwrite_existing_files = self.overwrite_existing_files
if crawler_depth is None:
crawler_depth = self.crawler_depth
if extract_hidden_text is None:
extract_hidden_text = self.extract_hidden_text
output_dir = Path(output_dir)
if not output_dir.exists():
@@ -155,9 +162,11 @@ class Crawler(BaseComponent):
pattern = re.compile("|".join(filter_urls))
for url in urls:
if pattern.search(url):
file_paths += self._write_to_files([url], output_dir=output_dir)
file_paths += self._write_to_files(
[url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
)
else:
file_paths += self._write_to_files(urls, output_dir=output_dir)
file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
# follow one level of sublinks if requested
if crawler_depth == 1:
sub_links: Dict[str, List] = {}
@@ -170,20 +179,32 @@ class Crawler(BaseComponent):
)
for url, extracted_sublink in sub_links.items():
file_paths += self._write_to_files(
extracted_sublink, output_dir=output_dir, base_url=url, id_hash_keys=id_hash_keys
extracted_sublink,
output_dir=output_dir,
base_url=url,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
)
return file_paths
def _write_to_files(
self, urls: List[str], output_dir: Path, base_url: str = None, id_hash_keys: Optional[List[str]] = None
self,
urls: List[str],
output_dir: Path,
extract_hidden_text: bool,
base_url: str = None,
id_hash_keys: Optional[List[str]] = None,
) -> List[Path]:
paths = []
for link in urls:
logger.info(f"writing contents from `{link}`")
self.driver.get(link)
el = self.driver.find_element_by_tag_name("body")
text = el.text
if extract_hidden_text:
text = el.get_attribute("textContent")
else:
text = el.text
link_split_values = link.replace("https://", "").split("/")
file_name = f"{'_'.join(link_split_values)}.json"
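The core change above sits in `_write_to_files`: Selenium's `element.text` returns only rendered, visible text, while the `textContent` DOM attribute also includes nodes hidden via CSS. A standalone sketch of the difference, assuming Chrome/chromedriver and a Selenium version that still offers `find_element_by_tag_name` (as the crawler code above does); the inline data URL is only for illustration:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# A tiny page with one visible and one hidden paragraph, passed as a data URL.
driver.get("data:text/html,<p>visible text</p> <p style='display: none'>hidden text</p>")
body = driver.find_element_by_tag_name("body")

print(body.text)                          # rendered text only: "visible text"
print(body.get_attribute("textContent"))  # also contains "hidden text"

driver.quit()
```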
@@ -210,6 +231,7 @@ class Crawler(BaseComponent):
overwrite_existing_files: Optional[bool] = None,
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
) -> Tuple[Dict, str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -227,6 +249,8 @@ class Crawler(BaseComponent):
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g., the text can be inside a span with style="display: none".
:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
"""
@@ -237,6 +261,7 @@ class Crawler(BaseComponent):
crawler_depth=crawler_depth,
filter_urls=filter_urls,
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
)
if return_documents:
crawled_data = []
@@ -258,6 +283,7 @@ class Crawler(BaseComponent):
overwrite_existing_files: Optional[bool] = None,
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
):
return self.run(
output_dir=output_dir,
@@ -267,6 +293,7 @@ class Crawler(BaseComponent):
overwrite_existing_files=overwrite_existing_files,
return_documents=return_documents,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
)
@staticmethod

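Finally, a rough sketch of calling the `run()` entry point directly, mirroring the pattern used in the new test below (`return_documents=True` returns Document objects in memory in addition to the JSON files; URL and output directory are placeholders):

```python
# run() returns a (results, output_edge) tuple when the Crawler is used as a node.
crawler = Crawler(output_dir="crawled_files")
results, _ = crawler.run(
    urls=["https://example.com"],
    crawler_depth=0,
    return_documents=True,
    extract_hidden_text=False,
)
print(results["documents"][0].content)  # hidden text is left out for this call
```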

@@ -24,7 +24,11 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
"""
crawler.driver.get(url)
body = crawler.driver.find_element_by_tag_name("body")
expected_crawled_content = body.text
if crawler.extract_hidden_text:
expected_crawled_content = body.get_attribute("textContent")
else:
expected_crawled_content = body.text
with open(crawled_page, "r") as crawled_file:
page_data = json.load(crawled_file)
@@ -142,3 +146,18 @@ def test_crawler_return_document(test_url, tmp_path):
file_content = json.load(doc_file)
assert file_content["meta"] == document.meta
assert file_content["content"] == document.content
def test_crawler_extract_hidden_text(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
documents, _ = crawler.run(
urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
)
crawled_content = documents["documents"][0].content
assert "hidden text" in crawled_content
documents, _ = crawler.run(
urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
)
crawled_content = documents["documents"][0].content
assert "hidden text" not in crawled_content


@@ -0,0 +1,10 @@
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<p>visible text</p>
<p style="display: none">hidden text</p>
</body>
</html>