feat: Add headline extraction to ParsrConverter (#3488)

* Add headline extraction to ParsrConverter

* Add sample PDF file

* Add test

* Use extract_headlines if set in convert method

* Integrate PR feedback
This commit is contained in:
bogdankostic 2022-10-31 19:00:02 +01:00 committed by GitHub
parent 8ddeda811a
commit 60224412bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 72 additions and 3 deletions

View File

@ -1,5 +1,5 @@
# pylint: disable=missing-timeout
import sys
from typing import Optional, Dict, List, Any
try:
@ -44,6 +44,7 @@ class ParsrConverter(BaseConverter):
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
add_page_number: bool = True,
extract_headlines: bool = True,
):
"""
:param parsr_url: URL endpoint to Parsr's REST API.
@ -69,6 +70,7 @@ class ParsrConverter(BaseConverter):
In this case the id will be generated by using the content and the defined metadata.
:param add_page_number: Adds the number of the page a table occurs in to the Document's meta field
`"page"`.
:param extract_headlines: Whether to extract headings from the PDF file.
"""
super().__init__(valid_languages=valid_languages, id_hash_keys=id_hash_keys)
@ -96,6 +98,7 @@ class ParsrConverter(BaseConverter):
self.remove_page_footers = remove_page_footers
self.remove_table_of_contents = remove_table_of_contents
self.add_page_number = add_page_number
self.extract_headlines = extract_headlines
super().__init__(valid_languages=valid_languages)
def convert(
@ -106,6 +109,7 @@ class ParsrConverter(BaseConverter):
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None,
extract_headlines: Optional[bool] = None,
) -> List[Document]:
"""
Extract text and tables from a PDF or DOCX using the open-source Parsr tool.
@ -124,12 +128,16 @@ class ParsrConverter(BaseConverter):
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
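:param extract_headlines: Whether to extract headings from the file and store them in the Document's meta field
`"headlines"`. If not set, the value passed to the converter's constructor is used.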
"""
if valid_languages is None:
valid_languages = self.valid_languages
if id_hash_keys is None:
id_hash_keys = self.id_hash_keys
if extract_headlines is None:
extract_headlines = self.extract_headlines
if meta is None:
meta = {}
with open(file_path, "rb") as pdf_file:
# Send file to Parsr
@ -154,16 +162,28 @@ class ParsrConverter(BaseConverter):
# Convert Parsr output to Haystack Documents
text = ""
tables = []
headlines = []
for page_idx, page in enumerate(parsr_output["pages"]):
for elem_idx, element in enumerate(page["elements"]):
if element["type"] in ["paragraph", "heading", "table-of-contents"]:
current_paragraph = self._convert_text_element(element)
if current_paragraph:
if element["type"] == "heading" and extract_headlines:
headlines.append(
{"headline": current_paragraph, "start_idx": len(text), "level": element["level"]}
)
text += f"{current_paragraph}\n\n"
elif element["type"] == "table":
table = self._convert_table_element(
element, parsr_output["pages"], page_idx, elem_idx, meta, id_hash_keys
element,
parsr_output["pages"],
page_idx,
elem_idx,
headlines,
extract_headlines,
meta,
id_hash_keys,
)
tables.append(table)
if text[-1] != "\f":
@ -184,6 +204,9 @@ class ParsrConverter(BaseConverter):
f"been decoded in the correct text format."
)
if extract_headlines:
meta["headlines"] = headlines
docs = tables + [Document(content=text.strip(), meta=meta, id_hash_keys=id_hash_keys)]
return docs
@ -219,6 +242,8 @@ class ParsrConverter(BaseConverter):
all_pages: List[Dict],
page_idx: int,
elem_idx: int,
headlines: List[Dict],
extract_headlines: bool,
meta: Optional[Dict[str, Any]] = None,
id_hash_keys: Optional[List[str]] = None,
) -> Document:
@ -290,5 +315,17 @@ class ParsrConverter(BaseConverter):
if self.add_page_number:
table_meta["page"] = page_idx + 1
if extract_headlines:
relevant_headlines = []
cur_lowest_headline_level = sys.maxsize
for headline in reversed(headlines):
if headline["level"] < cur_lowest_headline_level:
headline_copy = copy.deepcopy(headline)
headline_copy["start_idx"] = None
relevant_headlines.append(headline_copy)
cur_lowest_headline_level = headline_copy["level"]
relevant_headlines = relevant_headlines[::-1]
table_meta["headlines"] = relevant_headlines
table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
return Document(content=table_df, content_type="table", meta=table_meta, id_hash_keys=id_hash_keys)

View File

@ -220,6 +220,38 @@ def test_parsr_converter():
assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter_headline_extraction():
    """
    Check that ParsrConverter stores the expected headlines in each Document's meta.

    The sample PDF is expected to yield two Documents. For every Document the test verifies
    the headline text and level, and for text Documents it additionally verifies that
    `start_idx` points at the position of the headline inside the Document's content.
    """
    # One list of (headline, level) pairs per returned Document, in the order the Documents are returned.
    expected_headlines = [
        [("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)],
        [
            ("Lorem ipsum", 1),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
            ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
            ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
            ("In eleifend velit vitae libero sollicitudin euismod.", 2),
        ],
    ]

    converter = ParsrConverter()
    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()))
    assert len(docs) == 2

    for doc, expectation in zip(docs, expected_headlines):
        extracted_headlines = doc.meta["headlines"]
        # zip() stops at the shorter input, so without this check a Document with missing
        # (or no) headlines would pass the per-headline assertions below unnoticed.
        assert len(extracted_headlines) == len(expectation)

        for extracted_headline, (expected_headline, expected_level) in zip(extracted_headlines, expectation):
            # Check if correct headline and level is extracted
            assert extracted_headline["headline"] == expected_headline
            assert extracted_headline["level"] == expected_level

            # Check if correct start_idx is extracted
            if doc.content_type == "text":
                start_idx = extracted_headline["start_idx"]
                hl_len = len(extracted_headline["headline"])
                assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len]
def test_id_hash_keys_from_pipeline_params():
doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
meta_1 = {"key": "a"}

Binary file not shown.