mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 03:57:19 +00:00
feat: Add headline extraction to ParsrConverter (#3488)
* Add headline extraction to ParsrConverter * Add sample PDF file * Add test * Use extract_headlines if set in convert method * Integrate PR feedback
This commit is contained in:
parent
8ddeda811a
commit
60224412bc
@ -1,5 +1,5 @@
|
||||
# pylint: disable=missing-timeout
|
||||
|
||||
import sys
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
try:
|
||||
@ -44,6 +44,7 @@ class ParsrConverter(BaseConverter):
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
id_hash_keys: Optional[List[str]] = None,
|
||||
add_page_number: bool = True,
|
||||
extract_headlines: bool = True,
|
||||
):
|
||||
"""
|
||||
:param parsr_url: URL endpoint to Parsr's REST API.
|
||||
@ -69,6 +70,7 @@ class ParsrConverter(BaseConverter):
|
||||
In this case the id will be generated by using the content and the defined metadata.
|
||||
:param add_page_number: Adds the number of the page a table occurs in to the Document's meta field
|
||||
`"page"`.
|
||||
:param extract_headlines: Whether to extract headings from the PDF file.
|
||||
"""
|
||||
super().__init__(valid_languages=valid_languages, id_hash_keys=id_hash_keys)
|
||||
|
||||
@ -96,6 +98,7 @@ class ParsrConverter(BaseConverter):
|
||||
self.remove_page_footers = remove_page_footers
|
||||
self.remove_table_of_contents = remove_table_of_contents
|
||||
self.add_page_number = add_page_number
|
||||
self.extract_headlines = extract_headlines
|
||||
super().__init__(valid_languages=valid_languages)
|
||||
|
||||
def convert(
|
||||
@ -106,6 +109,7 @@ class ParsrConverter(BaseConverter):
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
id_hash_keys: Optional[List[str]] = None,
|
||||
extract_headlines: Optional[bool] = None,
|
||||
) -> List[Document]:
|
||||
"""
|
||||
Extract text and tables from a PDF or DOCX using the open-source Parsr tool.
|
||||
@ -124,12 +128,16 @@ class ParsrConverter(BaseConverter):
|
||||
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
|
||||
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
|
||||
In this case the id will be generated by using the content and the defined metadata.
|
||||
:param extract_headlines: Whether to extract headings from the PDF file.
|
||||
"""
|
||||
if valid_languages is None:
|
||||
valid_languages = self.valid_languages
|
||||
|
||||
if id_hash_keys is None:
|
||||
id_hash_keys = self.id_hash_keys
|
||||
if extract_headlines is None:
|
||||
extract_headlines = self.extract_headlines
|
||||
if meta is None:
|
||||
meta = {}
|
||||
|
||||
with open(file_path, "rb") as pdf_file:
|
||||
# Send file to Parsr
|
||||
@ -154,16 +162,28 @@ class ParsrConverter(BaseConverter):
|
||||
# Convert Parsr output to Haystack Documents
|
||||
text = ""
|
||||
tables = []
|
||||
headlines = []
|
||||
for page_idx, page in enumerate(parsr_output["pages"]):
|
||||
for elem_idx, element in enumerate(page["elements"]):
|
||||
if element["type"] in ["paragraph", "heading", "table-of-contents"]:
|
||||
current_paragraph = self._convert_text_element(element)
|
||||
if current_paragraph:
|
||||
if element["type"] == "heading" and extract_headlines:
|
||||
headlines.append(
|
||||
{"headline": current_paragraph, "start_idx": len(text), "level": element["level"]}
|
||||
)
|
||||
text += f"{current_paragraph}\n\n"
|
||||
|
||||
elif element["type"] == "table":
|
||||
table = self._convert_table_element(
|
||||
element, parsr_output["pages"], page_idx, elem_idx, meta, id_hash_keys
|
||||
element,
|
||||
parsr_output["pages"],
|
||||
page_idx,
|
||||
elem_idx,
|
||||
headlines,
|
||||
extract_headlines,
|
||||
meta,
|
||||
id_hash_keys,
|
||||
)
|
||||
tables.append(table)
|
||||
if text[-1] != "\f":
|
||||
@ -184,6 +204,9 @@ class ParsrConverter(BaseConverter):
|
||||
f"been decoded in the correct text format."
|
||||
)
|
||||
|
||||
if extract_headlines:
|
||||
meta["headlines"] = headlines
|
||||
|
||||
docs = tables + [Document(content=text.strip(), meta=meta, id_hash_keys=id_hash_keys)]
|
||||
return docs
|
||||
|
||||
@ -219,6 +242,8 @@ class ParsrConverter(BaseConverter):
|
||||
all_pages: List[Dict],
|
||||
page_idx: int,
|
||||
elem_idx: int,
|
||||
headlines: List[Dict],
|
||||
extract_headlines: bool,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
id_hash_keys: Optional[List[str]] = None,
|
||||
) -> Document:
|
||||
@ -290,5 +315,17 @@ class ParsrConverter(BaseConverter):
|
||||
if self.add_page_number:
|
||||
table_meta["page"] = page_idx + 1
|
||||
|
||||
if extract_headlines:
|
||||
relevant_headlines = []
|
||||
cur_lowest_headline_level = sys.maxsize
|
||||
for headline in reversed(headlines):
|
||||
if headline["level"] < cur_lowest_headline_level:
|
||||
headline_copy = copy.deepcopy(headline)
|
||||
headline_copy["start_idx"] = None
|
||||
relevant_headlines.append(headline_copy)
|
||||
cur_lowest_headline_level = headline_copy["level"]
|
||||
relevant_headlines = relevant_headlines[::-1]
|
||||
table_meta["headlines"] = relevant_headlines
|
||||
|
||||
table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
|
||||
return Document(content=table_df, content_type="table", meta=table_meta, id_hash_keys=id_hash_keys)
|
||||
|
||||
@ -220,6 +220,38 @@ def test_parsr_converter():
|
||||
assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter_headline_extraction():
    """Convert a sample PDF with ParsrConverter and check the extracted headlines.

    For each resulting document, verify that every headline entry in
    ``doc.meta["headlines"]`` carries the expected headline text and level,
    and — for the text document — that ``start_idx`` points at the position
    in ``doc.content`` where the headline text actually begins.
    """
    # Per-document expectations as (headline text, level) pairs.
    expected_headlines = [
        [("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)],
        [
            ("Lorem ipsum", 1),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
            ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
            ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
            ("In eleifend velit vitae libero sollicitudin euismod.", 2),
        ],
    ]

    converter = ParsrConverter()

    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()))
    assert len(docs) == 2

    for doc, doc_expectations in zip(docs, expected_headlines):
        for found, (want_text, want_level) in zip(doc.meta["headlines"], doc_expectations):
            # Headline text and level must match the expectation.
            assert found["headline"] == want_text
            assert found["level"] == want_level

            # For the text document, start_idx must locate the headline in the content.
            if doc.content_type == "text":
                begin = found["start_idx"]
                end = begin + len(found["headline"])
                assert found["headline"] == doc.content[begin:end]
|
||||
|
||||
|
||||
def test_id_hash_keys_from_pipeline_params():
|
||||
doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
|
||||
meta_1 = {"key": "a"}
|
||||
|
||||
BIN
test/samples/pdf/sample_pdf_4.pdf
Normal file
BIN
test/samples/pdf/sample_pdf_4.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user