Add ImageToTextConverter and PDFToTextOCRConverter that utilize OCR (#1349)

* add image.py converter

* add PDFtoImageConverter

* add init to PDFtoImageConverter and classes to __init__

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* revert change in base.py in file_conv

* Update base.py

* Update pdf.py

* add ocr file_converter testcase & update dockerfile

* fix tesseract exception message typo

* fix _image_to_text docstring

* add tesseract installation to CI

* add tesseract installation to CI

* add content test for PDF OCR converter

* update PDFToTextOCRConverter constructor docstring

* replace image files with tmp paths for image.py convert

* replace image files with tmp paths for image.py convert

* Update README.md

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
This commit is contained in:
Shahrukh Khan 2021-09-01 16:42:25 +02:00 committed by GitHub
parent 1d2252e96d
commit 4822536886
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 326 additions and 33 deletions

View File

@ -91,5 +91,8 @@ jobs:
- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin
- name: Install tesseract
run: sudo apt-get install tesseract-ocr libtesseract-dev poppler-utils
- name: Run tests
run: cd test && pytest -s ${{ matrix.test-path }}

View File

@ -10,6 +10,9 @@ RUN wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.
RUN apt-get install libpoppler-cpp-dev pkg-config -y --fix-missing
# Install Tesseract
RUN apt-get install tesseract-ocr libtesseract-dev poppler-utils
# copy code
COPY haystack /home/user/haystack

View File

@ -316,16 +316,17 @@ Please also refer to our [documentation](https://haystack.deepset.ai/overview/in
**What**
Different converters to extract text from your original files (pdf, docx, txt, html).
While it's almost impossible to cover all types, layouts, and special cases (especially in PDFs), we cover the most common formats (incl. multi-column) and extract meta-information (e.g., page splits). The converters are easily extendable so that you can customize them for your files if needed.
Different converters to extract text from your original files (pdf, docx, txt, md, html).
While it's almost impossible to cover all types, layouts, and special cases (especially in PDFs), we cover the most common formats (incl. multi-column) and extract meta-information (e.g., page splits). The converters are easily extendable so that you can customize them for your files if needed. We also provide an OCR-based approach for converting images or PDFs.
**Available options**
- Txt
- PDF
- PDF (incl. OCR)
- Docx
- Apache Tika (Supports > 340 file formats)
- Markdown
- Images
**Example**

View File

@ -4,3 +4,5 @@ from haystack.file_converter.markdown import MarkdownConverter
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.tika import TikaConverter
from haystack.file_converter.txt import TextConverter
from haystack.file_converter.image import ImageToTextConverter
from haystack.file_converter.pdf import PDFToTextOCRConverter

View File

@ -14,7 +14,11 @@ class BaseConverter(BaseComponent):
outgoing_edges = 1
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
def __init__(
self,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -29,7 +33,9 @@ class BaseConverter(BaseComponent):
"""
# save init parameters to enable export of component config as YAML
self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
self.set_config(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
self.remove_numeric_tables = remove_numeric_tables
self.valid_languages = valid_languages
@ -95,12 +101,14 @@ class BaseConverter(BaseComponent):
documents: list = []
for file_path, file_meta in zip(file_paths, meta):
documents.append(self.convert(
documents.append(
self.convert(
file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
))
)
)
result = {"documents": documents, **kwargs}
return result, "output_1"
@ -110,6 +118,7 @@ class FileTypeClassifier(BaseComponent):
"""
Route files in an Indexing Pipeline to corresponding file converters.
"""
outgoing_edges = 5
def _get_files_extension(self, file_paths: list) -> set:
@ -118,11 +127,11 @@ class FileTypeClassifier(BaseComponent):
:param file_paths:
:return: set
"""
return {file_path.suffix.lstrip('.') for file_path in file_paths}
return {file_path.suffix.lstrip(".") for file_path in file_paths}
def run(self, file_paths: Union[Path, List[Path]], **kwargs): # type: ignore
"""
Return the output based on file extension
Return the output based on file extension
"""
if isinstance(file_paths, Path):
file_paths = [file_paths]

View File

@ -0,0 +1,152 @@
import logging
import subprocess
from pathlib import Path
from typing import List, Optional, Dict, Any
import pytesseract
from PIL.PpmImagePlugin import PpmImageFile
from PIL import Image
from haystack.file_converter.base import BaseConverter
logger = logging.getLogger(__name__)
class ImageToTextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        Extract text from image files using the pytesseract OCR library.

        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified here
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                Defaults to ["eng"] when None is passed.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text. Run the following line of code to check available language packs:
                                # List of available languages
                                print(pytesseract.get_languages(config=''))
        """
        # Avoid a mutable default argument: None stands in for the previous default of ["eng"].
        if valid_languages is None:
            valid_languages = ["eng"]

        # save init parameters to enable export of component config as YAML
        self.set_config(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )

        # With shell=True the command must be a single string, not a list;
        # the shell returns 127 when the tesseract binary is not on PATH.
        verify_installation = subprocess.run("tesseract -v", shell=True)
        if verify_installation.returncode == 127:
            raise Exception(
                """tesseract is not installed.

                   Installation on Linux:
                   apt-get install tesseract-ocr libtesseract-dev poppler-utils

                   Installation on MacOS:
                   brew install tesseract

                   For installing specific language packs check here: https://tesseract-ocr.github.io/tessdoc/Installation.html
                """
            )

        # Query the installed language packs once; each call shells out to tesseract,
        # so keeping it outside the loop avoids one subprocess per requested language.
        installed_langs = pytesseract.get_languages(config="")
        tesseract_langs = []
        for language in valid_languages:
            if language in installed_langs and language not in tesseract_langs:
                tesseract_langs.append(language)
            else:
                raise Exception(
                    f"""{language} is not either a valid tesseract language code or its language pack isn't installed.

                    Check the list of valid tesseract language codes here: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html

                    For installing specific language packs check here: https://tesseract-ocr.github.io/tessdoc/Installation.html
                    """
                )

        ## if you have more than one language in images, then pass it to tesseract like this e.g., `fra+eng`
        self.tesseract_langs = "+".join(tesseract_langs)
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
    ) -> Dict[str, Any]:
        """
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

        :param file_path: path to image file
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages supported by tesseract
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: kept for interface compatibility with other converters; not used by OCR extraction.
        """
        image = Image.open(file_path)
        pages = self._image_to_text(image)
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages

        cleaned_pages = []
        for page in pages:
            cleaned_lines = []
            for line in page.splitlines():
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period(.)
                if remove_numeric_tables:
                    if (
                        words
                        and len(digits) / len(words) > 0.4
                        and not line.strip().endswith(".")
                    ):
                        logger.debug(f"Removing line '{line}' from file")
                        continue
                cleaned_lines.append(line)

            cleaned_pages.append("\n".join(cleaned_lines))

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for image is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        # Pages are joined by form feed, matching the page separator used by the PDF converters.
        text = "\f".join(cleaned_pages)
        document = {"text": text, "meta": meta}
        return document

    def _image_to_text(self, image: PpmImageFile) -> List[str]:
        """
        Extract text from image file.

        :param image: input image file
        :return: a single-element list with the OCR text of the image
        """
        text = [pytesseract.image_to_string(image, lang=self.tesseract_langs)]
        return text

View File

@ -1,15 +1,25 @@
import logging
import subprocess
from pathlib import Path
import tempfile
import os
from typing import List, Optional, Dict, Any
from pdf2image import convert_from_path, convert_from_bytes
from haystack.file_converter.base import BaseConverter
from haystack.file_converter.image import ImageToTextConverter
logger = logging.getLogger(__name__)
class PDFToTextConverter(BaseConverter):
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
def __init__(
self,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -22,9 +32,10 @@ class PDFToTextConverter(BaseConverter):
not one of the valid languages, then it might likely be encoding error resulting
in garbled text.
"""
# save init parameters to enable export of component config as YAML
self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
self.set_config(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
verify_installation = subprocess.run(["pdftotext -v"], shell=True)
if verify_installation.returncode == 127:
@ -42,7 +53,9 @@ class PDFToTextConverter(BaseConverter):
"""
)
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
def convert(
self,
@ -102,7 +115,11 @@ class PDFToTextConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
if (
words
and len(digits) / len(words) > 0.4
and not line.strip().endswith(".")
):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
cleaned_lines.append(line)
@ -122,7 +139,9 @@ class PDFToTextConverter(BaseConverter):
document = {"text": text, "meta": meta}
return document
def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1") -> List[str]:
def _read_pdf(
self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1"
) -> List[str]:
"""
Extract pages from the pdf file at file_path.
@ -134,9 +153,85 @@ class PDFToTextConverter(BaseConverter):
command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
else:
command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
output = subprocess.run(command, stdout=subprocess.PIPE, shell=False) # type: ignore
output = subprocess.run(command, stdout=subprocess.PIPE, shell=False) # type: ignore
document = output.stdout.decode(errors="ignore")
pages = document.split("\f")
pages = pages[:-1] # the last page in the split is always empty.
return pages
class PDFToTextOCRConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages supported by tesseract
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                Defaults to ["eng"] when None is passed.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        """
        # Avoid a mutable default argument: None stands in for the previous default of ["eng"].
        if valid_languages is None:
            valid_languages = ["eng"]

        # init image to text instance; it performs the actual OCR per page
        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)

        # save init parameters to enable export of component config as YAML
        self.set_config(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
    ) -> Dict[str, Any]:
        """
        Convert a file to a dictionary containing the text and any associated meta data.

        File converters may extract file meta like name or size. In addition to it, user
        supplied meta data like author, url, external IDs can be supplied as a dictionary.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: Select the file encoding (default is `utf-8`)
        """
        pages = []
        try:
            # Render each PDF page to an image, then OCR it via the image converter.
            images = convert_from_path(file_path)
            for image in images:
                # Context manager guarantees the temporary JPEG (and its file
                # descriptor) is removed even if OCR raises for this page.
                # NOTE(review): the temp file is placed next to this module on
                # purpose (see commit history) — confirm before changing dir=.
                with tempfile.NamedTemporaryFile(
                    dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg"
                ) as temp_img:
                    image.save(temp_img.name)
                    pages.append(self.image_2_text.convert(temp_img.name)["text"])
        except Exception as exception:
            # Best-effort conversion: log and return whatever pages succeeded.
            logger.error(f"File {file_path} has an error \n {exception}")

        # Pages are joined by form feed, matching the page separator used by PDFToTextConverter.
        raw_text = "\f".join(pages)
        document = {"text": raw_text, "meta": meta}
        return document

View File

@ -21,7 +21,9 @@ class TikaXHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
# find page div
pagediv = [value for attr, value in attrs if attr == "class" and value == "page"]
pagediv = [
value for attr, value in attrs if attr == "class" and value == "page"
]
if tag == "div" and pagediv:
self.ingest = True
@ -43,7 +45,7 @@ class TikaConverter(BaseConverter):
self,
tika_url: str = "http://localhost:9998/tika",
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None
valid_languages: Optional[List[str]] = None,
):
"""
:param tika_url: URL of the Tika server
@ -61,15 +63,21 @@ class TikaConverter(BaseConverter):
# save init parameters to enable export of component config as YAML
self.set_config(
tika_url=tika_url, remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
tika_url=tika_url,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
)
ping = requests.get(tika_url)
if ping.status_code != 200:
raise Exception(f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'")
raise Exception(
f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'"
)
self.tika_url = tika_url
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
def convert(
self,
@ -101,7 +109,9 @@ class TikaConverter(BaseConverter):
if valid_languages is None:
valid_languages = self.valid_languages
parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
parsed = tikaparser.from_file(
file_path.as_posix(), self.tika_url, xmlContent=True
)
parser = TikaXHTMLParser()
parser.feed(parsed["content"])
@ -116,7 +126,11 @@ class TikaConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
if (
words
and len(digits) / len(words) > 0.4
and not line.strip().endswith(".")
):
logger.debug(f"Removing line '{line}' from {file_path}")
continue

View File

@ -11,6 +11,10 @@ elastic-apm
tox
coverage
langdetect # for PDF conversions
# for PDF conversions using OCR
pytesseract==0.3.7
pillow==8.2.0
pdf2image==1.14.0
# optional: sentence-transformers
python-multipart
python-docx

View File

@ -4,12 +4,14 @@ import pytest
from haystack.file_converter import MarkdownConverter
from haystack.file_converter.docx import DocxToTextConverter
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
from haystack.file_converter.tika import TikaConverter
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
@pytest.mark.parametrize(
"Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
)
def test_convert(Converter, xpdf_fixture):
converter = Converter()
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
@ -17,6 +19,13 @@ def test_convert(Converter, xpdf_fixture):
assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != "" # the page 1 of PDF contains text.
assert pages[2] == "" # the page 3 of PDF file is empty.
# assert text is retained from the document.
# As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
page_standard_whitespace = " ".join(pages[0].split())
assert (
"Adobe Systems made the PDF specification available free of charge in 1993."
in page_standard_whitespace
)
@pytest.mark.tika
@ -29,22 +38,23 @@ def test_table_removal(Converter, xpdf_fixture):
assert "324" not in pages[0]
assert "54x growth" not in pages[0]
# assert text is retained from the document.
# As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
page_standard_whitespace = " ".join(pages[0].split())
assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, xpdf_fixture, caplog):
converter = Converter(valid_languages=["en"])
converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
assert (
"The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']."
not in caplog.text
)
converter = Converter(valid_languages=["de"])
converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
assert (
"The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']."
in caplog.text
)
def test_docx_converter():