feat: Add headline extraction to ParsrConverter (#3488)

* Add headline extraction to ParsrConverter
* Add sample PDF file
* Add test
* Use extract_headlines if set in convert method
* Integrate PR feedback
2026-01-06 03:57:19 +00:00 · 2022-10-31 19:00:02 +01:00 · 2022-10-31 19:00:02 +01:00 · 60224412bc
commit 60224412bc
parent 8ddeda811a
3 changed files with 72 additions and 3 deletions
--- a/haystack/nodes/file_converter/parsr.py
+++ b/haystack/nodes/file_converter/parsr.py
@ -1,5 +1,5 @@
 # pylint: disable=missing-timeout
-
+import sys
 from typing import Optional, Dict, List, Any

 try:
@ -44,6 +44,7 @@ class ParsrConverter(BaseConverter):
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        add_page_number: bool = True,
+        extract_headlines: bool = True,
    ):
        """
        :param parsr_url: URL endpoint to Parsr"s REST API.
@ -69,6 +70,7 @@ class ParsrConverter(BaseConverter):
            In this case the id will be generated by using the content and the defined metadata.
        :param add_page_number: Adds the number of the page a table occurs in to the Document's meta field
                                `"page"`.
+        :param extract_headlines: Whether to extract headings from the file and store them in the Document's meta field `"headlines"`.
        """
        super().__init__(valid_languages=valid_languages, id_hash_keys=id_hash_keys)

@ -96,6 +98,7 @@ class ParsrConverter(BaseConverter):
        self.remove_page_footers = remove_page_footers
        self.remove_table_of_contents = remove_table_of_contents
        self.add_page_number = add_page_number
+        self.extract_headlines = extract_headlines
        super().__init__(valid_languages=valid_languages)

    def convert(
@ -106,6 +109,7 @@ class ParsrConverter(BaseConverter):
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
        id_hash_keys: Optional[List[str]] = None,
+        extract_headlines: Optional[bool] = None,
    ) -> List[Document]:
        """
        Extract text and tables from a PDF or DOCX using the open-source Parsr tool.
@ -124,12 +128,16 @@ class ParsrConverter(BaseConverter):
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
+        :param extract_headlines: Whether to extract headings from the file. If None, the value set at initialization is used.
        """
        if valid_languages is None:
            valid_languages = self.valid_languages
-
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys
+        if extract_headlines is None:
+            extract_headlines = self.extract_headlines
+        if meta is None:
+            meta = {}

        with open(file_path, "rb") as pdf_file:
            # Send file to Parsr
@ -154,16 +162,28 @@ class ParsrConverter(BaseConverter):
            # Convert Parsr output to Haystack Documents
            text = ""
            tables = []
+            headlines = []
            for page_idx, page in enumerate(parsr_output["pages"]):
                for elem_idx, element in enumerate(page["elements"]):
                    if element["type"] in ["paragraph", "heading", "table-of-contents"]:
                        current_paragraph = self._convert_text_element(element)
                        if current_paragraph:
+                            if element["type"] == "heading" and extract_headlines:
+                                headlines.append(
+                                    {"headline": current_paragraph, "start_idx": len(text), "level": element["level"]}
+                                )
                            text += f"{current_paragraph}\n\n"

                    elif element["type"] == "table":
                        table = self._convert_table_element(
-                            element, parsr_output["pages"], page_idx, elem_idx, meta, id_hash_keys
+                            element,
+                            parsr_output["pages"],
+                            page_idx,
+                            elem_idx,
+                            headlines,
+                            extract_headlines,
+                            meta,
+                            id_hash_keys,
                        )
                        tables.append(table)
                if text[-1] != "\f":
@ -184,6 +204,9 @@ class ParsrConverter(BaseConverter):
                    f"been decoded in the correct text format."
                )

+        if extract_headlines:
+            meta["headlines"] = headlines
+
        docs = tables + [Document(content=text.strip(), meta=meta, id_hash_keys=id_hash_keys)]
        return docs

@ -219,6 +242,8 @@ class ParsrConverter(BaseConverter):
        all_pages: List[Dict],
        page_idx: int,
        elem_idx: int,
+        headlines: List[Dict],
+        extract_headlines: bool,
        meta: Optional[Dict[str, Any]] = None,
        id_hash_keys: Optional[List[str]] = None,
    ) -> Document:
@ -290,5 +315,17 @@ class ParsrConverter(BaseConverter):
        if self.add_page_number:
            table_meta["page"] = page_idx + 1

+        if extract_headlines:
+            relevant_headlines = []
+            cur_lowest_headline_level = sys.maxsize
+            for headline in reversed(headlines):
+                if headline["level"] < cur_lowest_headline_level:
+                    headline_copy = copy.deepcopy(headline)
+                    headline_copy["start_idx"] = None
+                    relevant_headlines.append(headline_copy)
+                    cur_lowest_headline_level = headline_copy["level"]
+            relevant_headlines = relevant_headlines[::-1]
+            table_meta["headlines"] = relevant_headlines
+
        table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
        return Document(content=table_df, content_type="table", meta=table_meta, id_hash_keys=id_hash_keys)
--- a/test/nodes/test_file_converter.py
+++ b/test/nodes/test_file_converter.py
@ -220,6 +220,38 @@ def test_parsr_converter():
    assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")


+@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
+def test_parsr_converter_headline_extraction():
+    expected_headlines = [
+        [("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)],
+        [
+            ("Lorem ipsum", 1),
+            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
+            ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
+            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
+            ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
+            ("In eleifend velit vitae libero sollicitudin euismod.", 2),
+        ],
+    ]
+
+    converter = ParsrConverter()
+
+    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()))
+    assert len(docs) == 2
+
+    for doc, expectation in zip(docs, expected_headlines):
+        for extracted_headline, (expected_headline, expected_level) in zip(doc.meta["headlines"], expectation):
+            # Check if correct headline and level is extracted
+            assert extracted_headline["headline"] == expected_headline
+            assert extracted_headline["level"] == expected_level
+
+            # Check if correct start_idx is extracted
+            if doc.content_type == "text":
+                start_idx = extracted_headline["start_idx"]
+                hl_len = len(extracted_headline["headline"])
+                assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len]
+
+
 def test_id_hash_keys_from_pipeline_params():
    doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
    meta_1 = {"key": "a"}
--- a/test/samples/pdf/sample_pdf_4.pdf
+++ b/test/samples/pdf/sample_pdf_4.pdf