mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 09:49:48 +00:00 
			
		
		
		
	fix: ParsrConverter list element added (#4562)
* fix: list element and mapping logic around it added to ParsrConverter convert step + unit test covering the specific mapping of list content from Parsr's to Haystack's
* Code review changes
* changed the samples path after conftest changes
* added samples_path to function arg

---------

Co-authored-by: Namoush <fmpereira22@gmail.com>
Co-authored-by: Fernando Pereira <fernando.pereira@criticalsoftware.com>
Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
Co-authored-by: bogdankostic <bogdankostic@web.de>
This commit is contained in:
		
							parent
							
								
									1ac9ca7fac
								
							
						
					
					
						commit
						5d41e60d89
					
				| @ -173,7 +173,7 @@ class ParsrConverter(BaseConverter): | |||||||
|             headlines = [] |             headlines = [] | ||||||
|             for page_idx, page in enumerate(parsr_output["pages"]): |             for page_idx, page in enumerate(parsr_output["pages"]): | ||||||
|                 for elem_idx, element in enumerate(page["elements"]): |                 for elem_idx, element in enumerate(page["elements"]): | ||||||
|                     if element["type"] in ["paragraph", "heading", "table-of-contents"]: |                     if element["type"] in ["paragraph", "heading", "table-of-contents", "list"]: | ||||||
|                         current_paragraph = self._convert_text_element(element) |                         current_paragraph = self._convert_text_element(element) | ||||||
|                         if current_paragraph: |                         if current_paragraph: | ||||||
|                             if element["type"] == "heading" and extract_headlines: |                             if element["type"] == "heading" and extract_headlines: | ||||||
| @ -236,12 +236,11 @@ class ParsrConverter(BaseConverter): | |||||||
|             return "" |             return "" | ||||||
|         if self.remove_page_footers and "isFooter" in element["properties"]: |         if self.remove_page_footers and "isFooter" in element["properties"]: | ||||||
|             return "" |             return "" | ||||||
|         if element["type"] == "table-of-contents": |         if element["type"] in ["table-of-contents", "list"]: | ||||||
|             if self.remove_table_of_contents: |             if self.remove_table_of_contents and element["type"] == "table-of-contents": | ||||||
|                 return "" |                 return "" | ||||||
|             else: |             current_paragraph = "\n".join([self._get_paragraph_string(elem) for elem in element["content"]]) | ||||||
|                 current_paragraph = "\n".join([self._get_paragraph_string(elem) for elem in element["content"]]) |             return current_paragraph | ||||||
|                 return current_paragraph |  | ||||||
| 
 | 
 | ||||||
|         current_paragraph = self._get_paragraph_string(element) |         current_paragraph = self._get_paragraph_string(element) | ||||||
|         return current_paragraph |         return current_paragraph | ||||||
|  | |||||||
| @ -396,6 +396,20 @@ def test_parsr_converter_headline_extraction(samples_path): | |||||||
|                 assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len] |                 assert extracted_headline["headline"] == doc.content[start_idx : start_idx + hl_len] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.integration | ||||||
|  | @pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI") | ||||||
|  | def test_parsr_converter_list_mapping(samples_path): | ||||||
|  |     # This exact line(without line break characters) only exists in the list object we want to make sure it's being mapped correctly | ||||||
|  |     expected_list_line = "Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate." | ||||||
|  | 
 | ||||||
|  |     converter = ParsrConverter() | ||||||
|  | 
 | ||||||
|  |     docs = converter.convert(file_path=str((samples_path / "pdf" / "sample_pdf_4.pdf").absolute())) | ||||||
|  |     assert len(docs) == 2 | ||||||
|  |     assert docs[1].content_type == "text" | ||||||
|  |     assert expected_list_line in docs[1].content | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @pytest.mark.unit | @pytest.mark.unit | ||||||
| def test_id_hash_keys_from_pipeline_params(samples_path): | def test_id_hash_keys_from_pipeline_params(samples_path): | ||||||
|     doc_path = samples_path / "docs" / "doc_1.txt" |     doc_path = samples_path / "docs" / "doc_1.txt" | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Fernando Pereira
						Fernando Pereira