From 5d41e60d89c51a5f4be2a4f9fc1da3b7a31f2b82 Mon Sep 17 00:00:00 2001
From: Fernando Pereira <fmrpereira22@gmail.com>
Date: Wed, 12 Apr 2023 14:08:21 +0100
Subject: [PATCH] fix: ParsrConverter list element added (#4562)

* fix: add the "list" element type and its mapping logic to the ParsrConverter convert step, plus a test covering the mapping of list content from Parsr's output to Haystack's

* Code review changes

* changed the samples path after conftest changes

* added samples_path to function arg

---------

Co-authored-by: Namoush <fmpereira22@gmail.com>
Co-authored-by: Fernando Pereira <fernando.pereira@criticalsoftware.com>
Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
Co-authored-by: bogdankostic <bogdankostic@web.de>
---
 haystack/nodes/file_converter/parsr.py | 11 +++++------
 test/nodes/test_file_converter.py      | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/haystack/nodes/file_converter/parsr.py b/haystack/nodes/file_converter/parsr.py
index e4036be04..d307b3dac 100644
--- a/haystack/nodes/file_converter/parsr.py
+++ b/haystack/nodes/file_converter/parsr.py
@@ -173,7 +173,7 @@ class ParsrConverter(BaseConverter):
             headlines = []
             for page_idx, page in enumerate(parsr_output["pages"]):
                 for elem_idx, element in enumerate(page["elements"]):
-                    if element["type"] in ["paragraph", "heading", "table-of-contents"]:
+                    if element["type"] in ["paragraph", "heading", "table-of-contents", "list"]:
                         current_paragraph = self._convert_text_element(element)
                         if current_paragraph:
                             if element["type"] == "heading" and extract_headlines:
@@ -236,12 +236,11 @@ class ParsrConverter(BaseConverter):
             return ""
         if self.remove_page_footers and "isFooter" in element["properties"]:
             return ""
-        if element["type"] == "table-of-contents":
-            if self.remove_table_of_contents:
+        if element["type"] in ["table-of-contents", "list"]:
+            if self.remove_table_of_contents and element["type"] == "table-of-contents":
                 return ""
-            else:
-                current_paragraph = "\n".join([self._get_paragraph_string(elem) for elem in element["content"]])
-                return current_paragraph
+            current_paragraph = "\n".join([self._get_paragraph_string(elem) for elem in element["content"]])
+            return current_paragraph
 
         current_paragraph = self._get_paragraph_string(element)
         return current_paragraph
diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py
index e8535602e..64c182a9d 100644
--- a/test/nodes/test_file_converter.py
+++ b/test/nodes/test_file_converter.py
@@ -396,6 +396,20 @@ def test_parsr_converter_headline_extraction(samples_path):
                 assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len]
 
 
+@pytest.mark.integration
+@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
+def test_parsr_converter_list_mapping(samples_path):
+    # This exact line (without line break characters) only exists in the list object, so finding it confirms the list is being mapped correctly
+    expected_list_line = "Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate."
+
+    converter = ParsrConverter()
+
+    docs = converter.convert(file_path=str((samples_path / "pdf" / "sample_pdf_4.pdf").absolute()))
+    assert len(docs) == 2
+    assert docs[1].content_type == "text"
+    assert expected_list_line in docs[1].content
+
+
 @pytest.mark.unit
 def test_id_hash_keys_from_pipeline_params(samples_path):
     doc_path = samples_path / "docs" / "doc_1.txt"