diff --git a/haystack/nodes/file_converter/parsr.py b/haystack/nodes/file_converter/parsr.py index a13d609f0..fee43fe09 100644 --- a/haystack/nodes/file_converter/parsr.py +++ b/haystack/nodes/file_converter/parsr.py @@ -1,5 +1,5 @@ # pylint: disable=missing-timeout - +import sys from typing import Optional, Dict, List, Any try: @@ -44,6 +44,7 @@ class ParsrConverter(BaseConverter): valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True, + extract_headlines: bool = True, ): """ :param parsr_url: URL endpoint to Parsr"s REST API. @@ -69,6 +70,7 @@ class ParsrConverter(BaseConverter): In this case the id will be generated by using the content and the defined metadata. :param add_page_number: Adds the number of the page a table occurs in to the Document's meta field `"page"`. + :param extract_headlines: Whether to extract headings from the PDF file. """ super().__init__(valid_languages=valid_languages, id_hash_keys=id_hash_keys) @@ -96,6 +98,7 @@ class ParsrConverter(BaseConverter): self.remove_page_footers = remove_page_footers self.remove_table_of_contents = remove_table_of_contents self.add_page_number = add_page_number + self.extract_headlines = extract_headlines super().__init__(valid_languages=valid_languages) def convert( @@ -106,6 +109,7 @@ class ParsrConverter(BaseConverter): valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, + extract_headlines: Optional[bool] = None, ) -> List[Document]: """ Extract text and tables from a PDF or DOCX using the open-source Parsr tool. @@ -124,12 +128,16 @@ class ParsrConverter(BaseConverter): attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. 
+ :param extract_headlines: Whether to extract headings from the file. If None, the value set at init is used. """ if valid_languages is None: valid_languages = self.valid_languages - if id_hash_keys is None: id_hash_keys = self.id_hash_keys + if extract_headlines is None: + extract_headlines = self.extract_headlines + if meta is None: + meta = {} with open(file_path, "rb") as pdf_file: # Send file to Parsr @@ -154,16 +162,28 @@ class ParsrConverter(BaseConverter): # Convert Parsr output to Haystack Documents text = "" tables = [] + headlines = [] for page_idx, page in enumerate(parsr_output["pages"]): for elem_idx, element in enumerate(page["elements"]): if element["type"] in ["paragraph", "heading", "table-of-contents"]: current_paragraph = self._convert_text_element(element) if current_paragraph: + if element["type"] == "heading" and extract_headlines: + headlines.append( + {"headline": current_paragraph, "start_idx": len(text), "level": element["level"]} + ) text += f"{current_paragraph}\n\n" elif element["type"] == "table": table = self._convert_table_element( - element, parsr_output["pages"], page_idx, elem_idx, meta, id_hash_keys + element, + parsr_output["pages"], + page_idx, + elem_idx, + headlines, + extract_headlines, + meta, + id_hash_keys, ) tables.append(table) if text[-1] != "\f": @@ -184,6 +204,9 @@ class ParsrConverter(BaseConverter): f"been decoded in the correct text format." 
) + if extract_headlines: + meta["headlines"] = headlines + docs = tables + [Document(content=text.strip(), meta=meta, id_hash_keys=id_hash_keys)] return docs @@ -219,6 +242,8 @@ class ParsrConverter(BaseConverter): all_pages: List[Dict], page_idx: int, elem_idx: int, + headlines: List[Dict], + extract_headlines: bool, meta: Optional[Dict[str, Any]] = None, id_hash_keys: Optional[List[str]] = None, ) -> Document: @@ -290,5 +315,17 @@ class ParsrConverter(BaseConverter): if self.add_page_number: table_meta["page"] = page_idx + 1 + if extract_headlines: + relevant_headlines = [] + cur_lowest_headline_level = sys.maxsize + for headline in reversed(headlines): + if headline["level"] < cur_lowest_headline_level: + headline_copy = copy.deepcopy(headline) + headline_copy["start_idx"] = None + relevant_headlines.append(headline_copy) + cur_lowest_headline_level = headline_copy["level"] + relevant_headlines = relevant_headlines[::-1] + table_meta["headlines"] = relevant_headlines + table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:]) return Document(content=table_df, content_type="table", meta=table_meta, id_hash_keys=id_hash_keys) diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index 1cffe6b55..d515287b1 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -220,6 +220,38 @@ def test_parsr_converter(): assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.") +@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI") +def test_parsr_converter_headline_extraction(): + expected_headlines = [ + [("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)], + [ + ("Lorem ipsum", 1), + ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. 
Nunc ac faucibus odio.", 2), + ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2), + ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2), + ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2), + ("In eleifend velit vitae libero sollicitudin euismod.", 2), + ], + ] + + converter = ParsrConverter() + + docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute())) + assert len(docs) == 2 + + for doc, expectation in zip(docs, expected_headlines): + for extracted_headline, (expected_headline, expected_level) in zip(doc.meta["headlines"], expectation): + # Check if correct headline and level is extracted + assert extracted_headline["headline"] == expected_headline + assert extracted_headline["level"] == expected_level + + # Check if correct start_idx is extracted + if doc.content_type == "text": + start_idx = extracted_headline["start_idx"] + hl_len = len(extracted_headline["headline"]) + assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len] + + def test_id_hash_keys_from_pipeline_params(): doc_path = SAMPLES_PATH / "docs" / "doc_1.txt" meta_1 = {"key": "a"} diff --git a/test/samples/pdf/sample_pdf_4.pdf b/test/samples/pdf/sample_pdf_4.pdf new file mode 100644 index 000000000..94d947797 Binary files /dev/null and b/test/samples/pdf/sample_pdf_4.pdf differ