diff --git a/haystack/nodes/file_converter/parsr.py b/haystack/nodes/file_converter/parsr.py index e4036be04..d307b3dac 100644 --- a/haystack/nodes/file_converter/parsr.py +++ b/haystack/nodes/file_converter/parsr.py @@ -173,7 +173,7 @@ class ParsrConverter(BaseConverter): headlines = [] for page_idx, page in enumerate(parsr_output["pages"]): for elem_idx, element in enumerate(page["elements"]): - if element["type"] in ["paragraph", "heading", "table-of-contents"]: + if element["type"] in ["paragraph", "heading", "table-of-contents", "list"]: current_paragraph = self._convert_text_element(element) if current_paragraph: if element["type"] == "heading" and extract_headlines: @@ -236,12 +236,11 @@ class ParsrConverter(BaseConverter): return "" if self.remove_page_footers and "isFooter" in element["properties"]: return "" - if element["type"] == "table-of-contents": - if self.remove_table_of_contents: + if element["type"] in ["table-of-contents", "list"]: + if self.remove_table_of_contents and element["type"] == "table-of-contents": return "" - else: - current_paragraph = "\n".join([self._get_paragraph_string(elem) for elem in element["content"]]) - return current_paragraph + current_paragraph = "\n".join([self._get_paragraph_string(elem) for elem in element["content"]]) + return current_paragraph current_paragraph = self._get_paragraph_string(element) return current_paragraph diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index e8535602e..64c182a9d 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -396,6 +396,20 @@ def test_parsr_converter_headline_extraction(samples_path): assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len] +@pytest.mark.integration +@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI") +def test_parsr_converter_list_mapping(samples_path): + # This exact line(without line break characters) only 
exists in the list object; we want to make sure it's being mapped correctly + expected_list_line = "Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate." + + converter = ParsrConverter() + + docs = converter.convert(file_path=str((samples_path / "pdf" / "sample_pdf_4.pdf").absolute())) + assert len(docs) == 2 + assert docs[1].content_type == "text" + assert expected_list_line in docs[1].content + + @pytest.mark.unit def test_id_hash_keys_from_pipeline_params(samples_path): doc_path = samples_path / "docs" / "doc_1.txt"