Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-08-23 16:08:19 +00:00)
Make crawler extract also hidden text (#2642)
* make crawler extract also hidden text
* Update Documentation & Code Style
* try to adapt test for extract_hidden_text
* Update Documentation & Code Style
* fix test bug
* fix bug in test
* added test for hidden text
* Update Documentation & Code Style
* fix bug in test
* Update Documentation & Code Style
* fix test
* Update Documentation & Code Style
* fix other test bug

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in: parent c8f9e1b76c, commit c178f60e3a
@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
 #### Crawler.\_\_init\_\_
 
 ```python
-def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None)
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True)
 ```
 
 Init object with basic params for crawling (can be overwritten later).
@@ -46,13 +46,15 @@ All URLs not matching at least one of the regular expressions will be dropped.
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in the page.
+E.g. the text can be inside a span with style="display: none"
 
 <a id="crawler.Crawler.crawl"></a>
 
 #### Crawler.crawl
 
 ```python
-def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Path]
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None) -> List[Path]
 ```
 
 Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -86,7 +88,7 @@ List of paths where the crawled webpages got stored
 #### Crawler.run
 
 ```python
-def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None) -> Tuple[Dict, str]
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True) -> Tuple[Dict, str]
 ```
 
 Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -106,6 +108,8 @@ All URLs not matching at least one of the regular expressions will be dropped.
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in the page.
+E.g. the text can be inside a span with style="display: none"
 
 **Returns**:
 
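For reference, a minimal usage sketch of the constructor and `crawl()` signatures documented above. The output directory and URL are placeholder values, and running it assumes the Selenium/Chrome driver setup the Crawler node needs:

```python
from haystack.nodes import Crawler

# Placeholder output directory; substitute your own paths and URLs.
crawler = Crawler(
    output_dir="crawled_files",
    crawler_depth=0,            # only fetch the given URLs, no sublinks
    extract_hidden_text=True,   # new flag: also keep text hidden via CSS
)

# The flag can also be overridden per call; crawl() returns the paths of the stored JSON files.
paths = crawler.crawl(urls=["https://example.com"], extract_hidden_text=False)
print(paths)
```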
@@ -1879,6 +1879,10 @@
           "items": {
             "type": "string"
           }
         },
+        "extract_hidden_text": {
+          "title": "Extract Hidden Text",
+          "default": true
+        }
       },
       "required": [
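This hunk appears to come from the generated pipeline JSON schema, where the new parameter becomes a validated property with a default of `true`. As a rough illustration (not the actual schema file), a trimmed fragment can be checked with the `jsonschema` package:

```python
from jsonschema import validate  # third-party: pip install jsonschema

# Trimmed, illustrative fragment mirroring only the property added above.
crawler_params_fragment = {
    "type": "object",
    "properties": {
        "extract_hidden_text": {"title": "Extract Hidden Text", "default": True},
    },
}

# A params mapping that overrides the default still validates against the fragment.
validate(instance={"extract_hidden_text": False}, schema=crawler_params_fragment)
print("params are valid")
```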
@@ -47,6 +47,7 @@ class Crawler(BaseComponent):
         filter_urls: Optional[List] = None,
         overwrite_existing_files=True,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text=True,
     ):
         """
         Init object with basic params for crawling (can be overwritten later).
@@ -63,6 +64,8 @@ class Crawler(BaseComponent):
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param extract_hidden_text: Whether to extract the hidden text contained in the page.
+            E.g. the text can be inside a span with style="display: none"
         """
         super().__init__()
 
@@ -93,6 +96,7 @@ class Crawler(BaseComponent):
         self.filter_urls = filter_urls
         self.overwrite_existing_files = overwrite_existing_files
         self.id_hash_keys = id_hash_keys
+        self.extract_hidden_text = extract_hidden_text
 
     def crawl(
         self,
@@ -102,6 +106,7 @@ class Crawler(BaseComponent):
         filter_urls: Optional[List] = None,
         overwrite_existing_files: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = None,
     ) -> List[Path]:
         """
         Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -138,6 +143,8 @@ class Crawler(BaseComponent):
             overwrite_existing_files = self.overwrite_existing_files
         if crawler_depth is None:
             crawler_depth = self.crawler_depth
+        if extract_hidden_text is None:
+            extract_hidden_text = self.extract_hidden_text
 
         output_dir = Path(output_dir)
         if not output_dir.exists():
@@ -155,9 +162,11 @@ class Crawler(BaseComponent):
                 pattern = re.compile("|".join(filter_urls))
                 for url in urls:
                     if pattern.search(url):
-                        file_paths += self._write_to_files([url], output_dir=output_dir)
+                        file_paths += self._write_to_files(
+                            [url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
+                        )
             else:
-                file_paths += self._write_to_files(urls, output_dir=output_dir)
+                file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
             # follow one level of sublinks if requested
             if crawler_depth == 1:
                 sub_links: Dict[str, List] = {}
@@ -170,20 +179,32 @@ class Crawler(BaseComponent):
                     )
                 for url, extracted_sublink in sub_links.items():
                     file_paths += self._write_to_files(
-                        extracted_sublink, output_dir=output_dir, base_url=url, id_hash_keys=id_hash_keys
+                        extracted_sublink,
+                        output_dir=output_dir,
+                        base_url=url,
+                        id_hash_keys=id_hash_keys,
+                        extract_hidden_text=extract_hidden_text,
                     )
 
         return file_paths
 
     def _write_to_files(
-        self, urls: List[str], output_dir: Path, base_url: str = None, id_hash_keys: Optional[List[str]] = None
+        self,
+        urls: List[str],
+        output_dir: Path,
+        extract_hidden_text: bool,
+        base_url: str = None,
+        id_hash_keys: Optional[List[str]] = None,
     ) -> List[Path]:
         paths = []
         for link in urls:
             logger.info(f"writing contents from `{link}`")
             self.driver.get(link)
             el = self.driver.find_element_by_tag_name("body")
-            text = el.text
+            if extract_hidden_text:
+                text = el.get_attribute("textContent")
+            else:
+                text = el.text
 
             link_split_values = link.replace("https://", "").split("/")
             file_name = f"{'_'.join(link_split_values)}.json"
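The core behavioral change sits in `_write_to_files`: Selenium's `element.text` returns only rendered (visible) text, while the DOM `textContent` attribute also includes text hidden via CSS. A standalone sketch of that difference, assuming a local Chrome/chromedriver install and the Selenium 3-style API used in this file:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Headless Chrome so no browser window opens; requires chromedriver on PATH.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Inline data: URL with one visible and one CSS-hidden paragraph, mirroring the test page below.
driver.get("data:text/html,<body><p>visible text</p><p style='display: none'>hidden text</p></body>")
body = driver.find_element_by_tag_name("body")

print(body.text)                          # rendered text only -> "visible text"
print(body.get_attribute("textContent"))  # full DOM text -> also contains "hidden text"
driver.quit()
```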
@@ -210,6 +231,7 @@ class Crawler(BaseComponent):
         overwrite_existing_files: Optional[bool] = None,
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = True,
     ) -> Tuple[Dict, str]:
         """
         Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -227,6 +249,8 @@ class Crawler(BaseComponent):
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param extract_hidden_text: Whether to extract the hidden text contained in the page.
+            E.g. the text can be inside a span with style="display: none"
 
         :return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
         """
@@ -237,6 +261,7 @@ class Crawler(BaseComponent):
             crawler_depth=crawler_depth,
             filter_urls=filter_urls,
             overwrite_existing_files=overwrite_existing_files,
+            extract_hidden_text=extract_hidden_text,
         )
         if return_documents:
             crawled_data = []
@@ -258,6 +283,7 @@ class Crawler(BaseComponent):
         overwrite_existing_files: Optional[bool] = None,
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = True,
     ):
         return self.run(
             output_dir=output_dir,
@@ -267,6 +293,7 @@ class Crawler(BaseComponent):
             overwrite_existing_files=overwrite_existing_files,
             return_documents=return_documents,
             id_hash_keys=id_hash_keys,
+            extract_hidden_text=extract_hidden_text,
         )
 
     @staticmethod
@@ -24,7 +24,11 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
     """
     crawler.driver.get(url)
     body = crawler.driver.find_element_by_tag_name("body")
-    expected_crawled_content = body.text
+
+    if crawler.extract_hidden_text:
+        expected_crawled_content = body.get_attribute("textContent")
+    else:
+        expected_crawled_content = body.text
 
     with open(crawled_page, "r") as crawled_file:
         page_data = json.load(crawled_file)
@@ -142,3 +146,18 @@ def test_crawler_return_document(test_url, tmp_path):
         file_content = json.load(doc_file)
         assert file_content["meta"] == document.meta
         assert file_content["content"] == document.content
+
+
+def test_crawler_extract_hidden_text(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    documents, _ = crawler.run(
+        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
+    )
+    crawled_content = documents["documents"][0].content
+    assert "hidden text" in crawled_content
+
+    documents, _ = crawler.run(
+        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
+    )
+    crawled_content = documents["documents"][0].content
+    assert "hidden text" not in crawled_content
test/samples/crawler/page_w_hidden_text.html (new file, 10 lines)
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page</title>
+</head>
+<body>
+    <p>visible text</p>
+    <p style="display: none">hidden text</p>
+</body>
+</html>