Mirror of https://github.com/deepset-ai/haystack.git

Make crawler extract also hidden text (#2642)

* make crawler extract also hidden text
* Update Documentation & Code Style
* try to adapt test for extract_hidden_text
* Update Documentation & Code Style
* fix test bug
* fix bug in test
* added test for hidden text
* Update Documentation & Code Style
* fix bug in test
* Update Documentation & Code Style
* fix test
* Update Documentation & Code Style
* fix other test bug

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

Parent: c8f9e1b76c
Commit: c178f60e3a
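In short, `Crawler` gains an `extract_hidden_text` flag (default `True`): when enabled, the crawler reads the full `textContent` of the page body instead of only the visible text. A minimal usage sketch, assuming a Haystack v1.x install with the Selenium-based crawler available; the output directory and URL below are placeholders:

```python
from haystack.nodes import Crawler

# With extract_hidden_text=True (the new default) the crawler also captures text
# that is hidden via CSS, e.g. elements styled with display: none.
crawler = Crawler(output_dir="crawled_files", extract_hidden_text=True)
file_paths = crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"], crawler_depth=0)
```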
Crawler API documentation (Markdown):

@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
 #### Crawler.\_\_init\_\_
 
 ```python
-def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None)
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True)
 ```
 
 Init object with basic params for crawling (can be overwritten later).
@@ -46,13 +46,15 @@ All URLs not matching at least one of the regular expressions will be dropped.
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
 
 <a id="crawler.Crawler.crawl"></a>
 
 #### Crawler.crawl
 
 ```python
-def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Path]
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None) -> List[Path]
 ```
 
 Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -86,7 +88,7 @@ List of paths where the crawled webpages got stored
 #### Crawler.run
 
 ```python
-def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None) -> Tuple[Dict, str]
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True) -> Tuple[Dict, str]
 ```
 
 Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -106,6 +108,8 @@ All URLs not matching at least one of the regular expressions will be dropped.
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
 
 **Returns**:
 
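As documented above, `run()` is the entry point used when the Crawler acts as a node in a Haystack pipeline, and it returns a tuple of an output dict and the name of the output edge. A short sketch of calling it directly, based on the signature shown above; the URL and directory are placeholders:

```python
from haystack.nodes import Crawler

crawler = Crawler(output_dir="crawled_files")
output, edge = crawler.run(
    urls=["https://haystack.deepset.ai/overview/intro"],
    crawler_depth=0,
    return_documents=True,     # return Document objects instead of file paths
    extract_hidden_text=True,  # the parameter added by this commit
)
documents = output["documents"]  # with return_documents=False, output["paths"] lists the written JSON files
```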
JSON schema (Crawler component parameters):

@@ -1879,6 +1879,10 @@
         "items": {
           "type": "string"
         }
+      },
+      "extract_hidden_text": {
+        "title": "Extract Hidden Text",
+        "default": true
       }
     },
     "required": [
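This fragment registers the new parameter in the schema with a default of `true`, matching the constructor default added below. A quick, hedged way to check the constructor side (assuming Haystack v1.x is installed):

```python
import inspect

from haystack.nodes import Crawler

# The schema default above mirrors the default on Crawler.__init__ introduced by this commit.
print(inspect.signature(Crawler.__init__).parameters["extract_hidden_text"].default)  # True
```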
Crawler implementation (`class Crawler(BaseComponent)`):

@@ -47,6 +47,7 @@ class Crawler(BaseComponent):
         filter_urls: Optional[List] = None,
         overwrite_existing_files=True,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text=True,
     ):
         """
         Init object with basic params for crawling (can be overwritten later).
@@ -63,6 +64,8 @@ class Crawler(BaseComponent):
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param extract_hidden_text: Whether to extract the hidden text contained in page.
+            E.g. the text can be inside a span with style="display: none"
         """
         super().__init__()
 
@@ -93,6 +96,7 @@ class Crawler(BaseComponent):
         self.filter_urls = filter_urls
         self.overwrite_existing_files = overwrite_existing_files
         self.id_hash_keys = id_hash_keys
+        self.extract_hidden_text = extract_hidden_text
 
     def crawl(
         self,
@@ -102,6 +106,7 @@ class Crawler(BaseComponent):
         filter_urls: Optional[List] = None,
         overwrite_existing_files: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = None,
     ) -> List[Path]:
         """
         Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -138,6 +143,8 @@ class Crawler(BaseComponent):
             overwrite_existing_files = self.overwrite_existing_files
         if crawler_depth is None:
             crawler_depth = self.crawler_depth
+        if extract_hidden_text is None:
+            extract_hidden_text = self.extract_hidden_text
 
         output_dir = Path(output_dir)
         if not output_dir.exists():
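The hunk above makes `crawl()` treat `None` as "not set" and fall back to the value stored on the instance, so the constructor default can be overridden per call. A small illustration of that behaviour; the URL and directory are placeholders:

```python
from haystack.nodes import Crawler

crawler = Crawler(output_dir="crawled_files", extract_hidden_text=False)

# No argument passed: the instance default (False) applies and hidden text is skipped.
crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"], crawler_depth=0)

# Explicit argument: overrides the instance default for this call only.
crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"], crawler_depth=0, extract_hidden_text=True)
```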
@@ -155,9 +162,11 @@ class Crawler(BaseComponent):
             pattern = re.compile("|".join(filter_urls))
             for url in urls:
                 if pattern.search(url):
-                    file_paths += self._write_to_files([url], output_dir=output_dir)
+                    file_paths += self._write_to_files(
+                        [url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
+                    )
         else:
-            file_paths += self._write_to_files(urls, output_dir=output_dir)
+            file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
         # follow one level of sublinks if requested
         if crawler_depth == 1:
             sub_links: Dict[str, List] = {}
@@ -170,19 +179,31 @@ class Crawler(BaseComponent):
                 )
                 for url, extracted_sublink in sub_links.items():
                     file_paths += self._write_to_files(
-                        extracted_sublink, output_dir=output_dir, base_url=url, id_hash_keys=id_hash_keys
+                        extracted_sublink,
+                        output_dir=output_dir,
+                        base_url=url,
+                        id_hash_keys=id_hash_keys,
+                        extract_hidden_text=extract_hidden_text,
                     )
 
         return file_paths
 
     def _write_to_files(
-        self, urls: List[str], output_dir: Path, base_url: str = None, id_hash_keys: Optional[List[str]] = None
+        self,
+        urls: List[str],
+        output_dir: Path,
+        extract_hidden_text: bool,
+        base_url: str = None,
+        id_hash_keys: Optional[List[str]] = None,
     ) -> List[Path]:
         paths = []
         for link in urls:
             logger.info(f"writing contents from `{link}`")
             self.driver.get(link)
             el = self.driver.find_element_by_tag_name("body")
-            text = el.text
+            if extract_hidden_text:
+                text = el.get_attribute("textContent")
+            else:
+                text = el.text
 
             link_split_values = link.replace("https://", "").split("/")
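The behavioural core of the change is in `_write_to_files`: Selenium's `WebElement.text` returns only rendered (visible) text, while reading the `textContent` attribute returns the full DOM text, including elements hidden with `display: none`. A standalone sketch of the difference, assuming a Chrome driver set up roughly like the crawler's own and a local copy of the sample page added by this commit (the file path is a placeholder):

```python
from selenium import webdriver

driver = webdriver.Chrome()  # assumes a chromedriver is available on PATH
driver.get("file:///tmp/page_w_hidden_text.html")  # placeholder path to the sample page
body = driver.find_element_by_tag_name("body")     # same Selenium 3 style call as the code above

print(body.text)                          # visible text only
print(body.get_attribute("textContent"))  # also includes the hidden text
driver.quit()
```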
@@ -210,6 +231,7 @@ class Crawler(BaseComponent):
         overwrite_existing_files: Optional[bool] = None,
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = True,
     ) -> Tuple[Dict, str]:
         """
         Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -227,6 +249,8 @@ class Crawler(BaseComponent):
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param extract_hidden_text: Whether to extract the hidden text contained in page.
+            E.g. the text can be inside a span with style="display: none"
 
         :return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
         """
@@ -237,6 +261,7 @@ class Crawler(BaseComponent):
            crawler_depth=crawler_depth,
            filter_urls=filter_urls,
            overwrite_existing_files=overwrite_existing_files,
+           extract_hidden_text=extract_hidden_text,
         )
         if return_documents:
             crawled_data = []
@@ -258,6 +283,7 @@ class Crawler(BaseComponent):
         overwrite_existing_files: Optional[bool] = None,
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
+        extract_hidden_text: Optional[bool] = True,
     ):
         return self.run(
             output_dir=output_dir,
@@ -267,6 +293,7 @@ class Crawler(BaseComponent):
             overwrite_existing_files=overwrite_existing_files,
             return_documents=return_documents,
             id_hash_keys=id_hash_keys,
+            extract_hidden_text=extract_hidden_text,
         )
 
     @staticmethod
Tests:

@@ -24,6 +24,10 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
     """
     crawler.driver.get(url)
     body = crawler.driver.find_element_by_tag_name("body")
-    expected_crawled_content = body.text
+
+    if crawler.extract_hidden_text:
+        expected_crawled_content = body.get_attribute("textContent")
+    else:
+        expected_crawled_content = body.text
 
     with open(crawled_page, "r") as crawled_file:
@@ -142,3 +146,18 @@ def test_crawler_return_document(test_url, tmp_path):
         file_content = json.load(doc_file)
         assert file_content["meta"] == document.meta
         assert file_content["content"] == document.content
+
+
+def test_crawler_extract_hidden_text(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    documents, _ = crawler.run(
+        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
+    )
+    crawled_content = documents["documents"][0].content
+    assert "hidden text" in crawled_content
+
+    documents, _ = crawler.run(
+        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
+    )
+    crawled_content = documents["documents"][0].content
+    assert "hidden text" not in crawled_content
New file: test/samples/crawler/page_w_hidden_text.html (10 lines)

@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page</title>
+</head>
+<body>
+    <p>visible text</p>
+    <p style="display: none">hidden text</p>
+</body>
+</html>