mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 01:39:45 +00:00 
			
		
		
		
	 01ea4bf21f
			
		
	
	
		01ea4bf21f
		
			
		
	
	
	
	
		
			
			* Change default encoding for PDFToTextConverter * Update Documentation & Code Style * Improve docstring * Update Documentation & Code Style * Add list of ligatures to ignore and add the possibility to modify such list at need * Add docstring * Add tests * Rename parameter * Update Documentation & Code Style * Move implementation into the base converter to make mypy happier * Update Documentation & Code Style * mypy and pylint * mypy * move encoding parameter to init of PDFToTextConverter * Update Documentation & Code Style * make utf8 default and fix mypy * Update Documentation & Code Style * Update Documentation & Code Style * remove note on encoding in tutorial8 * Update Documentation & Code Style * skip OCRConverter and test converter.run * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai>
		
			
				
	
	
		
			154 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			154 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from pathlib import Path
 | ||
| import os
 | ||
| 
 | ||
| import pytest
 | ||
| 
 | ||
| from haystack.nodes import (
 | ||
|     MarkdownConverter,
 | ||
|     DocxToTextConverter,
 | ||
|     PDFToTextConverter,
 | ||
|     PDFToTextOCRConverter,
 | ||
|     TikaConverter,
 | ||
|     AzureConverter,
 | ||
|     ParsrConverter,
 | ||
| )
 | ||
| 
 | ||
| from .conftest import SAMPLES_PATH
 | ||
| 
 | ||
| 
 | ||
@pytest.mark.tika
@pytest.mark.parametrize(
    # NOTE: PDFToTextConverter and TikaConverter are currently not part of this list; they only
    # appear in a commented-out variant of it: [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
    "Converter",
    [PDFToTextOCRConverter],
)
def test_convert(Converter):
    """Convert the four-page sample PDF and check the page split and the retained text."""
    sample_file = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    documents = Converter().convert(file_path=sample_file)

    # Pages are separated by form feed characters in the converted content.
    pages = documents[0].content.split("\f")
    assert len(pages) == 4  # the sample PDF file has four pages.

    first_page = pages[0]
    third_page = pages[2]
    assert first_page != ""  # the page 1 of PDF contains text.
    assert third_page == ""  # the page 3 of PDF file is empty.

    # Whitespace can differ between converters (\n, " ", etc.), so collapse every run of
    # whitespace into a single space before looking for the expected sentence.
    normalized_first_page = " ".join(first_page.split())
    expected_sentence = "Adobe Systems made the PDF specification available free of charge in 1993."
    assert expected_sentence in normalized_first_page
| 
 | ||
| 
 | ||
@pytest.mark.parametrize("Converter", [PDFToTextConverter])  # TODO PDFToTextOCRConverter should pass this test too
def test_pdf_encoding(Converter):
    """Check that "ɪ" is extracted by default and is absent when ``encoding="Latin1"`` is passed to ``run``."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf"
    converter = Converter()

    # Default settings: the character must be present in the extracted text.
    default_result = converter.run(file_paths=pdf_path)
    default_content = default_result[0]["documents"][0].content
    assert "ɪ" in default_content

    # Latin1 override: the character must no longer be present.
    latin1_result = converter.run(file_paths=pdf_path, encoding="Latin1")
    latin1_content = latin1_result[0]["documents"][0].content
    assert "ɪ" not in latin1_content
| 
 | ||
| 
 | ||
@pytest.mark.parametrize("Converter", [PDFToTextConverter])  # TODO PDFToTextOCRConverter should pass this test too
def test_pdf_ligatures(Converter):
    """Check how the ``known_ligatures`` argument of ``run`` changes the extracted text.

    Three runs over the same sample PDF are compared:

    * no ``known_ligatures`` argument: the "ff" ligature character is gone, "ɪ" is still there;
    * ``known_ligatures={}``: both the "ff" ligature character and "ɪ" are there;
    * ``known_ligatures={"ɪ": "i"}``: the "ff" ligature character is there, "ɪ" is gone.
    """
    # Written as an escape on purpose: the assertions are about the single ligature code point
    # U+FB00 (LATIN SMALL LIGATURE FF), not the two ASCII letters "ff". A literal ligature is
    # visually indistinguishable from "ff" and is flattened to it by compatibility normalisation,
    # which would make the assertions below contradict each other.
    ff_ligature = "\ufb00"
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf"
    converter = Converter()

    document = converter.run(file_paths=pdf_path)[0]["documents"][0]
    assert ff_ligature not in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=pdf_path, known_ligatures={})[0]["documents"][0]
    assert ff_ligature in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=pdf_path, known_ligatures={"ɪ": "i"})[0]["documents"][0]
    assert ff_ligature in document.content
    assert "ɪ" not in document.content
| 
 | ||
| 
 | ||
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
    """With ``remove_numeric_tables=True`` the numeric table rows must not show up on the first page."""
    sample_file = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    documents = Converter(remove_numeric_tables=True).convert(file_path=sample_file)

    first_page = documents[0].content.split("\f")[0]

    # Both snippets come from numeric rows of the table and must have been dropped.
    assert "324" not in first_page
    assert "54x growth" not in first_page
| 
 | ||
| 
 | ||
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
    """Check that the "is not one of" log message is absent for ``["en"]`` and present for ``["de"]``."""
    sample_file = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"

    # valid_languages=["en"]: no mismatch message is expected in the captured log.
    Converter(valid_languages=["en"]).convert(file_path=sample_file)
    english_message = "samples/pdf/sample_pdf_1.pdf is not one of ['en']."
    assert english_message not in caplog.text

    # valid_languages=["de"]: the mismatch message must have been logged.
    Converter(valid_languages=["de"]).convert(file_path=sample_file)
    german_message = "samples/pdf/sample_pdf_1.pdf is not one of ['de']."
    assert german_message in caplog.text
| 
 | ||
| 
 | ||
def test_docx_converter():
    """Convert the sample .docx file and check how the extracted content begins."""
    docx_path = SAMPLES_PATH / "docx" / "sample_docx.docx"
    documents = DocxToTextConverter().convert(file_path=docx_path)
    first_document = documents[0]
    assert first_document.content.startswith("Sample Docx File")
| 
 | ||
| 
 | ||
def test_markdown_converter():
    """Convert the sample Markdown file and check how the extracted content begins."""
    markdown_path = SAMPLES_PATH / "markdown" / "sample.md"
    documents = MarkdownConverter().convert(file_path=markdown_path)
    first_document = documents[0]
    assert first_document.content.startswith("What to build with Haystack")
| 
 | ||
| 
 | ||
def test_azure_converter():
    """Convert the sample PDF with ``AzureConverter`` and check the resulting table and text documents.

    The converter is configured from the ``AZURE_FORMRECOGNIZER_ENDPOINT`` and
    ``AZURE_FORMRECOGNIZER_KEY`` environment variables. If either one is missing the test is
    reported as skipped, so that a run without credentials is not mistaken for a passing run
    in which the assertions were actually executed.
    """
    endpoint = os.environ.get("AZURE_FORMRECOGNIZER_ENDPOINT")
    credential_key = os.environ.get("AZURE_FORMRECOGNIZER_KEY")
    if endpoint is None or credential_key is None:
        pytest.skip("AZURE_FORMRECOGNIZER_ENDPOINT and AZURE_FORMRECOGNIZER_KEY must be set to run this test")

    converter = AzureConverter(
        endpoint=endpoint,
        credential_key=credential_key,
        save_json=True,
    )

    docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert len(docs) == 2

    # First document: the table found in the PDF.
    assert docs[0].content_type == "table"
    assert docs[0].content.shape[0] == 4  # number of rows
    assert docs[0].content.shape[1] == 5  # number of columns, Form Recognizer assumes there are 5 columns
    assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
    assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
    assert (
        docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
        "standardized and their\nspecification is published only on "
        "Adobe's website. Many of them are also not\nsupported by "
        "popular third-party implementations of PDF."
    )
    assert docs[0].meta["following_context"] == ""

    # Second document: the plain text of the PDF.
    assert docs[1].content_type == "text"
    assert docs[1].content.startswith("A sample PDF file")
| 
 | ||
| 
 | ||
def test_parsr_converter():
    """Convert the sample PDF with ``ParsrConverter`` and check the resulting table and text documents."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    # The converter is handed the absolute path as a plain string.
    docs = ParsrConverter().convert(file_path=str(pdf_path.absolute()))
    assert len(docs) == 2

    table_doc = docs[0]
    text_doc = docs[1]

    # First document: the table found in the PDF.
    assert table_doc.content_type == "table"
    table = table_doc.content
    assert table.shape[0] == 4  # number of rows
    assert table.shape[1] == 4
    assert list(table.columns) == ["", "Column 1", "Column 2", "Column 3"]
    assert list(table.iloc[3]) == ["D", "$54.35", "$6345.", ""]

    expected_preceding_context = (
        "specification. These proprietary technologies are not "
        "standardized and their\nspecification is published only on "
        "Adobe's website. Many of them are also not\nsupported by popular "
        "third-party implementations of PDF."
    )
    assert table_doc.meta["preceding_context"] == expected_preceding_context
    assert table_doc.meta["following_context"] == ""

    # Second document: the plain text of the PDF.
    assert text_doc.content_type == "text"
    text = text_doc.content
    assert text.startswith("A sample PDF file")
    assert text.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")