From 075ed7fbcb70bcefc01da5c90c26a7654d02ed0a Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Tue, 24 May 2022 11:31:32 +0200 Subject: [PATCH] Remove encoding option from PDFToTextOCRConverter (#2553) * remove encoding option from PDFToTextOCRConverter * Update Documentation & Code Style * add unused 'encoding' param to PDFToTextOCRConverter * Update Documentation & Code Style * call run instead of convert to use ligature replacing * Update Documentation & Code Style * add text to check installed poppler version * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/file_converter.md | 7 +++--- haystack/nodes/file_converter/image.py | 3 ++- haystack/nodes/file_converter/pdf.py | 6 ++--- test/nodes/test_file_converter.py | 33 +++++++++++++++++++------- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index c39a711a4..f6ef56fc5 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -204,7 +204,7 @@ In this case the id will be generated by using the content and the defined metad #### ImageToTextConverter.convert ```python -def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) @@ -224,6 +224,7 @@ The rows containing strings are thus retained in this option. This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text. +- `encoding`: Not applicable - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). @@ -390,7 +391,7 @@ In this case the id will be generated by using the content and the defined metad #### PDFToTextOCRConverter.convert ```python -def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Convert a file to a dictionary containing the text and any associated meta data. @@ -412,7 +413,7 @@ The rows containing strings are thus retained in this option. This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text. -- `encoding`: Select the file encoding (default is `UTF-8`) +- `encoding`: Not applicable - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). diff --git a/haystack/nodes/file_converter/image.py b/haystack/nodes/file_converter/image.py index ba136df5c..a50cccf42 100644 --- a/haystack/nodes/file_converter/image.py +++ b/haystack/nodes/file_converter/image.py @@ -88,7 +88,7 @@ class ImageToTextConverter(BaseConverter): meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "utf-8", + encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None, ) -> List[Document]: """ @@ -107,6 +107,7 @@ class ImageToTextConverter(BaseConverter): This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text. + :param encoding: Not applicable :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index ab2fcb518..e6d89c35e 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -215,7 +215,7 @@ class PDFToTextOCRConverter(BaseConverter): meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "UTF-8", + encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None, ) -> List[Document]: """ @@ -236,7 +236,7 @@ class PDFToTextOCRConverter(BaseConverter): This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text. - :param encoding: Select the file encoding (default is `UTF-8`) + :param encoding: Not applicable :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). @@ -251,7 +251,7 @@ class PDFToTextOCRConverter(BaseConverter): for image in images: temp_img = tempfile.NamedTemporaryFile(dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg") image.save(temp_img.name) - pages.append(self.image_2_text.convert(file_path=temp_img.name, encoding=encoding)[0].content) + pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content) except Exception as exception: logger.error(f"File {file_path} has an error \n {exception}") diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index f79afd52a..3e3234d0d 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -1,5 +1,5 @@ -from pathlib import Path import os +import subprocess import pytest @@ -17,15 +17,16 @@ from ..conftest import SAMPLES_PATH @pytest.mark.tika -@pytest.mark.parametrize( - # "Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter] - "Converter", - [PDFToTextOCRConverter], -) +@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]) def test_convert(Converter): converter = Converter() - document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0] + document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0] pages = document.content.split("\f") + + assert ( + len(pages) != 1 and pages[0] != "" + ), f'{type(converter).__name__} did return a single empty page indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()' + assert len(pages) == 4 # the sample PDF file has four pages. assert pages[0] != "" # the page 1 of PDF contains text. assert pages[2] == "" # the page 3 of PDF file is empty. @@ -35,7 +36,21 @@ def test_convert(Converter): assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace -@pytest.mark.parametrize("Converter", [PDFToTextConverter]) # TODO PDFToTextOCRConverter should pass this test too +def test_pdftoppm_command_format(): + # Haystack's PDFToTextOCRConverter uses pdf2image, which calls pdftoppm internally. + # Some installations of pdftoppm are incompatible with Haystack and won't raise an error but just return empty converted documents + # This test runs pdftoppm directly to check whether pdftoppm accepts the command format that pdf2image uses in Haystack + proc = subprocess.Popen( + ["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + out, err = proc.communicate() + # If usage info of pdftoppm is sent to stderr then it's because Haystack's pdf2image uses an incompatible command format + assert ( + not err + ), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"' + + +@pytest.mark.parametrize("Converter", [PDFToTextConverter]) def test_pdf_encoding(Converter): converter = Converter() @@ -46,7 +61,7 @@ def test_pdf_encoding(Converter): assert "ɪ" not in document.content -@pytest.mark.parametrize("Converter", [PDFToTextConverter]) # TODO PDFToTextOCRConverter should pass this test too +@pytest.mark.parametrize("Converter", [PDFToTextConverter]) def test_pdf_ligatures(Converter): converter = Converter()