mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-28 18:36:36 +00:00
fix: ParsrConverter list element added (#4562)
* fix: list element and mapping logic around it added to ParsrConverter convert step + unit test covering the specific mapping of list content from Parsr's to Haystack's * Code review changes * changed the samples path after conftest changes * added samples_path to function arg --------- Co-authored-by: Namoush <fmpereira22@gmail.com> Co-authored-by: Fernando Pereira <fernando.pereira@criticalsoftware.com> Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com> Co-authored-by: bogdankostic <bogdankostic@web.de>
This commit is contained in:
parent
1ac9ca7fac
commit
5d41e60d89
@ -173,7 +173,7 @@ class ParsrConverter(BaseConverter):
|
||||
headlines = []
|
||||
for page_idx, page in enumerate(parsr_output["pages"]):
|
||||
for elem_idx, element in enumerate(page["elements"]):
|
||||
if element["type"] in ["paragraph", "heading", "table-of-contents"]:
|
||||
if element["type"] in ["paragraph", "heading", "table-of-contents", "list"]:
|
||||
current_paragraph = self._convert_text_element(element)
|
||||
if current_paragraph:
|
||||
if element["type"] == "heading" and extract_headlines:
|
||||
@ -236,10 +236,9 @@ class ParsrConverter(BaseConverter):
|
||||
return ""
|
||||
if self.remove_page_footers and "isFooter" in element["properties"]:
|
||||
return ""
|
||||
if element["type"] == "table-of-contents":
|
||||
if self.remove_table_of_contents:
|
||||
if element["type"] in ["table-of-contents", "list"]:
|
||||
if self.remove_table_of_contents and element["type"] == "table-of-contents":
|
||||
return ""
|
||||
else:
|
||||
current_paragraph = "\n".join([self._get_paragraph_string(elem) for elem in element["content"]])
|
||||
return current_paragraph
|
||||
|
||||
|
@ -396,6 +396,20 @@ def test_parsr_converter_headline_extraction(samples_path):
|
||||
assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len]
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
|
||||
def test_parsr_converter_list_mapping(samples_path):
|
||||
# This exact line (without line-break characters) only exists in the list object; we look for it to make sure the list is being mapped correctly
|
||||
expected_list_line = "Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate."
|
||||
|
||||
converter = ParsrConverter()
|
||||
|
||||
docs = converter.convert(file_path=str((samples_path / "pdf" / "sample_pdf_4.pdf").absolute()))
|
||||
assert len(docs) == 2
|
||||
assert docs[1].content_type == "text"
|
||||
assert expected_list_line in docs[1].content
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_id_hash_keys_from_pipeline_params(samples_path):
|
||||
doc_path = samples_path / "docs" / "doc_1.txt"
|
||||
|
Loading…
x
Reference in New Issue
Block a user