feat: add automatic OCR detection mechanism and improve performance (#4329)

* feat: add automatic OCR detection mechanism and improve performance * refactor: add error message * refactor: ignore pdftoppm bad typing * refactor: add Tesseract install. docstrings * fix: check if OCR var. assigned on mp * tests: add path to windows/linux tests * tests: add tessdata path * tests: include matrix ref. * tests: custom Tesseract matrix install * refactor: improve user guide * tests: fix macos path * tests: remove brew formulae version * fix: macos paths * tests: fix macos path * tests: add Tesseract to Windows Path * tests: pytesseract path * tests: macos path * refactor: fix path message and remove extra path from tests * refactor: raise exception when path not found * refactor: expression simplification Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * refactor: check ocr parameter * tests: mark as integration * tests: mock deprecation warning * refactor: simplify code Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * refactor: change deprecation test Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * refactor: add unit patch * refactor: black formatting --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
2025-12-26 22:48:29 +00:00 · 2023-03-13 11:49:22 -03:00 · 2023-03-13 11:49:22 -03:00 · 28724e2e25
commit 28724e2e25
parent fd3f3143d4
4 changed files with 120 additions and 19 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -466,6 +466,7 @@ jobs:
    - name: Run tests
      env:
        TOKENIZERS_PARALLELISM: 'false'
+        TESSDATA_PREFIX: '/usr/share/tesseract-ocr/4.00/tessdata'
      # as confusing as it seems, we skip tests marked as unit here as it means they have been migrated to the right job already
      run: |
        pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration and not unit" test/${{ matrix.folder }} --document_store_type=memory
@ -506,6 +507,7 @@ jobs:
    - name: Run tests
      env:
        TOKENIZERS_PARALLELISM: 'false'
+        TESSDATA_PREFIX: 'C:\Program Files\Tesseract-OCR\tessdata'
      # as confusing as it seems, we skip tests marked as unit here as it means they have been migrated to the right job already
      run: |
        pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration and not unit" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory
@ -592,6 +594,7 @@ jobs:
    - name: Run tests
      env:
        TOKENIZERS_PARALLELISM: 'false'  # Avoid logspam by tokenizers
+        TESSDATA_PREFIX: '/usr/share/tesseract-ocr/4.00/tessdata'
      # we add "and not document_store" to exclude the tests that were ported to the new strategy
      run: |
        pytest ${{ env.PYTEST_PARAMS }} -m "integration and not document_store" test/${{ matrix.folder }}
@ -634,6 +637,11 @@ jobs:
        refreshenv
        Get-Service elasticsearch-service-x64 | Start-Service

+    - name: Add Tesseract to system PATH
+      shell: pwsh
+      run: |
+        echo "C:\Program Files\Tesseract-OCR\" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+
    - name: Setup Python
      uses: ./.github/actions/python_cache/
      with:
@ -645,6 +653,7 @@ jobs:
    - name: Run tests
      env:
        TOKENIZERS_PARALLELISM: 'false'  # Avoid logspam by tokenizers
+        TESSDATA_PREFIX: 'C:\Program Files\Tesseract-OCR\tessdata'
      # FIXME many tests are disabled here!
      run: |
        pytest ${{ env.PYTEST_PARAMS }} -m "integration and not tika and not graphdb" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory,faiss,elasticsearch
--- a/haystack/nodes/file_converter/pdf.py
+++ b/haystack/nodes/file_converter/pdf.py
@ -1,9 +1,10 @@
 import logging
+import os
 import warnings
 from concurrent.futures import ProcessPoolExecutor
 from multiprocessing import cpu_count
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union

 from more_itertools import divide

@ -29,6 +30,8 @@ class PDFToTextConverter(BaseConverter):
        encoding: Optional[str] = None,
        keep_physical_layout: Optional[bool] = None,
        sort_by_position: bool = False,
+        ocr: Optional[Literal["auto", "full"]] = None,
+        ocr_language: str = "eng",
        multiprocessing: Union[bool, int] = True,
    ) -> None:
        """
@ -52,6 +55,11 @@ class PDFToTextConverter(BaseConverter):
        :param sort_by_position: Specifies whether to sort the extracted text by positional coordinates or logical reading order.
                        If set to True, the text is sorted first by vertical position, and then by horizontal position.
                        If set to False (default), the logical reading order in the PDF is used.
+        :param ocr: Specifies whether to use OCR to extract text from images in the PDF. If set to "auto", OCR is used only to extract text
+                    from images and integrate it into the existing text. If set to "full", OCR is used to extract text from the entire PDF.
+        :param ocr_language: Specifies the language to use for OCR. The default language is English, whose language code is `eng`.
+                For a list of supported languages and their codes, see https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html.
+                You can combine multiple languages by passing a string with the language codes separated by `+`. For example, to use English and German, pass `eng+deu`.
        :param multiprocessing: We use multiprocessing to speed up PyMuPDF conversion, you can disable it by setting it to False.
                                If set to True (the default value), the total number of cores is used. To specify the number of cores to use, set it to an integer.
        """
@ -61,6 +69,13 @@ class PDFToTextConverter(BaseConverter):

        self.sort_by_position = sort_by_position
        self.multiprocessing = multiprocessing
+        self.ocr = ocr
+        self.ocr_language = ocr_language
+
+        if ocr is not None:
+            if ocr not in ["auto", "full"]:
+                raise ValueError("The ocr parameter must be either 'auto' or 'full'.")
+            self._check_tessdata()

        if encoding:
            warnings.warn(
@ -83,6 +98,8 @@ class PDFToTextConverter(BaseConverter):
        end_page: Optional[int] = None,
        keep_physical_layout: Optional[bool] = None,
        sort_by_position: Optional[bool] = None,
+        ocr: Optional[Literal["auto", "full"]] = None,
+        ocr_language: Optional[str] = None,
        multiprocessing: Optional[Union[bool, int]] = None,
    ) -> List[Document]:
        """
@ -112,6 +129,12 @@ class PDFToTextConverter(BaseConverter):
            In this case the id will be generated by using the content and the defined metadata.
        :param start_page: The page number where to start the conversion
        :param end_page: The page number where to end the conversion.
+        :param ocr: Specifies whether to use OCR to extract text from images in the PDF. If set to "auto", OCR is used only to extract text
+                    from images and integrate it into the existing text. If set to "full", OCR is used to extract text from the entire PDF.
+                    To use this feature you must install Tesseract-OCR. For more information, see https://github.com/tesseract-ocr/tesseract#installing-tesseract.
+        :param ocr_language: Specifies the language to use for OCR. The default language is English, whose language code is `eng`.
+                For a list of supported languages and their codes, see https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html.
+                You can combine multiple languages by passing a string with the language codes separated by `+`. For example, to use English and German, pass `eng+deu`.
        :param multiprocessing: We use multiprocessing to speed up PyMuPDF conversion, you can disable it by setting it to False.
                                If set to None (the default value), the value defined in the class initialization is used.
                                If set to True, the total number of cores is used. To specify the number of cores to use, set it to an integer.
@ -126,6 +149,10 @@ class PDFToTextConverter(BaseConverter):
            multiprocessing = self.multiprocessing
        if sort_by_position is None:
            sort_by_position = self.sort_by_position
+        if ocr is None:
+            ocr = self.ocr
+        if ocr_language is None:
+            ocr_language = self.ocr_language

        if encoding:
            warnings.warn(
@ -136,11 +163,18 @@ class PDFToTextConverter(BaseConverter):
        if keep_physical_layout:
            warnings.warn("The keep_physical_layout parameter is being deprecated.", DeprecationWarning)

+        if ocr is not None:
+            if ocr not in ["auto", "full"]:
+                raise ValueError("The ocr parameter must be either 'auto' or 'full'.")
+            self._check_tessdata()
+
        pages = self._read_pdf(
            file_path,
            sort_by_position=sort_by_position,
            start_page=start_page,
            end_page=end_page,
+            ocr=ocr,
+            ocr_language=ocr_language,
            multiprocessing=multiprocessing,
        )

@ -176,24 +210,47 @@ class PDFToTextConverter(BaseConverter):
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]

+    def _check_tessdata(self):
+        if os.getenv("TESSDATA_PREFIX") is None:
+            raise EnvironmentError(
+                """
+                To enable OCR support via PDFToTextConverter, you need to install Tesseract:
+                    - Windows: choco install tesseract-ocr
+                    - Linux (Ubuntu): sudo apt-get install tesseract-ocr
+                    - Mac: brew install tesseract
+                After that, you need to set the environment variable TESSDATA_PREFIX to the path
+                of your Tesseract data directory. Typically this is:
+                    - Windows: C:\\Program Files\\Tesseract-OCR\\tessdata
+                    - Linux (Ubuntu): /usr/share/tesseract-ocr/4.00/tessdata
+                    - Mac (Intel):  /usr/local/Cellar/tesseract/5.3.0_1/share/tessdata
+                    - Mac (M1/M2): /opt/homebrew/Cellar/tesseract/5.3.0_1/share/tessdata
+                """
+            )
+
    def _get_text_parallel(self, page_mp):
-        idx, filename, parts, sort_by_position = page_mp
+        idx, filename, parts, sort_by_position, ocr, ocr_language = page_mp

        doc = fitz.open(filename)

        text = ""
        for i in parts[idx]:
            page = doc[i]
-            text += page.get_text("text", sort=sort_by_position) + "\f"
+            partial_tp = None
+            if ocr is not None:
+                full = ocr == "full"
+                partial_tp = page.get_textpage_ocr(flags=0, full=full, dpi=300, language=ocr_language)
+            text += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f"

        return text

    def _read_pdf(
        self,
        file_path: Path,
+        ocr_language: str,
        sort_by_position: bool = False,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
+        ocr: Optional[Literal["auto", "full"]] = None,
        multiprocessing: Optional[Union[bool, int]] = None,
    ) -> List[str]:
        """
@ -227,13 +284,17 @@ class PDFToTextConverter(BaseConverter):
        if not multiprocessing:
            for i in range(start_page, end_page):
                page = doc[i]
-                document += page.get_text("text", sort=sort_by_position) + "\f"
+                partial_tp = None
+                if ocr is not None:
+                    full = ocr == "full"
+                    partial_tp = page.get_textpage_ocr(flags=0, full=full, dpi=300, language=ocr_language)
+                document += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f"
        else:
            cpu = cpu_count() if isinstance(multiprocessing, bool) else multiprocessing
            page_list = [i for i in range(start_page, end_page)]
            cpu = cpu if len(page_list) > cpu else len(page_list)
            parts = divide(cpu, page_list)
-            pages_mp = [(i, file_path, parts, sort_by_position) for i in range(cpu)]
+            pages_mp = [(i, file_path, parts, sort_by_position, ocr, ocr_language) for i in range(cpu)]

            with ProcessPoolExecutor(max_workers=cpu) as pool:
                results = pool.map(self._get_text_parallel, pages_mp)
--- a/haystack/nodes/file_converter/pdf_ocr.py
+++ b/haystack/nodes/file_converter/pdf_ocr.py
@ -1,5 +1,6 @@
 import logging
 import tempfile
+import warnings
 from pathlib import Path
 from typing import Any, Dict, List, Optional

@ -42,6 +43,15 @@ class PDFToTextOCRConverter(BaseConverter):
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        """
+
+        warnings.warn(
+            """
+            The PDFToTextOCRConverter node is deprecated and will be removed in future versions.
+            Please use the PDFToTextConverter node instead and set the parameter ocr and ocr_language.
+            """,
+            category=DeprecationWarning,
+        )
+
        if valid_languages is None:
            valid_languages = ["eng"]
        # init image to text instance
@ -95,7 +105,7 @@ class PDFToTextOCRConverter(BaseConverter):

        pages = []
        try:
-            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
+            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)  # type: ignore
            for image in images:
                temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
                image.save(temp_img.name)
--- a/test/nodes/test_file_converter.py
+++ b/test/nodes/test_file_converter.py
@ -1,28 +1,29 @@
-from typing import List
-
-import os
-import sys
-from pathlib import Path
-import subprocess
 import csv
 import json
+import os
+import subprocess
+import sys
+import warnings
+from pathlib import Path
+from typing import List
+from unittest.mock import patch

 import pandas as pd
 import pytest

 from haystack import Document
 from haystack.nodes import (
-    MarkdownConverter,
+    AzureConverter,
+    CsvTextConverter,
    DocxToTextConverter,
+    JsonConverter,
+    MarkdownConverter,
+    ParsrConverter,
    PDFToTextConverter,
    PDFToTextOCRConverter,
-    TikaConverter,
-    AzureConverter,
-    ParsrConverter,
-    TextConverter,
-    CsvTextConverter,
-    JsonConverter,
    PreProcessor,
+    TextConverter,
+    TikaConverter,
 )

 from ..conftest import SAMPLES_PATH, fail_at_version
@ -184,6 +185,26 @@ def test_pdf_parallel_sort_by_position(Converter):
    assert pages[-1] == "This is the page 50 of the document."


+@pytest.mark.integration
+@pytest.mark.parametrize("Converter", [PDFToTextConverter])
+def test_pdf_parallel_ocr(Converter):
+    converter = Converter(multiprocessing=True, sort_by_position=True, ocr="full", ocr_language="eng")
+    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf")[0]
+
+    pages = document.content.split("\f")
+
+    assert pages[0] == "This is the page 1 of the document."
+    assert pages[-1] == "This is the page 50 of the document."
+
+
+@pytest.mark.unit
+@fail_at_version(1, 17)
+@patch("haystack.nodes.file_converter.image.ImageToTextConverter.__new__")
+def test_deprecated_ocr_node(mock):
+    with pytest.warns(DeprecationWarning):
+        PDFToTextOCRConverter()
+
+
@fail_at_version(1, 17)
 def test_deprecated_encoding():
    with pytest.warns(DeprecationWarning):