feat: add automatic OCR detection mechanism and improve performance (#4329)

* feat: add automatic OCR detection mechanism and improve performance

* refactor: add error message

* refactor: ignore pdftoppm bad typing

* refactor: add Tesseract install. docstrings

* fix: check if OCR var. assigned on mp

* tests: add path to windows/linux tests

* tests: add tessdata path

* tests: include matrix ref.

* tests: custom Tesseract matrix install

* refactor: improve user guide

* tests: fix macos path

* tests: remove brew formulae version

* fix: macos paths

* tests: fix macos path

* tests: add Tesseract to Windows Path

* tests: pytesseract path

* tests: macos path

* refactor: fix path message and remove extra path from tests

* refactor: raise exception when path not found

* refactor: expression simplification

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* refactor: check ocr parameter

* tests: mark as integration

* tests: mock deprecation warning

* refactor: simplify code

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* refactor: change deprecation test

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* refactor: add unit patch

* refactor: black formatting

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
This commit is contained in:
Daniel Bichuetti 2023-03-13 11:49:22 -03:00 committed by GitHub
parent fd3f3143d4
commit 28724e2e25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 120 additions and 19 deletions

View File

@ -466,6 +466,7 @@ jobs:
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false'
TESSDATA_PREFIX: '/usr/share/tesseract-ocr/4.00/tessdata'
# as confusing as it seems, we skip tests marked as unit here as it means they have been migrated to the right job already
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration and not unit" test/${{ matrix.folder }} --document_store_type=memory
@ -506,6 +507,7 @@ jobs:
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false'
TESSDATA_PREFIX: 'C:\Program Files\Tesseract-OCR\tessdata'
# as confusing as it seems, we skip tests marked as unit here as it means they have been migrated to the right job already
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration and not unit" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory
@ -592,6 +594,7 @@ jobs:
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers
TESSDATA_PREFIX: '/usr/share/tesseract-ocr/4.00/tessdata'
# we add "and not document_store" to exclude the tests that were ported to the new strategy
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "integration and not document_store" test/${{ matrix.folder }}
@ -634,6 +637,11 @@ jobs:
refreshenv
Get-Service elasticsearch-service-x64 | Start-Service
- name: Add Tesseract to system PATH
shell: pwsh
run: |
echo "C:\Program Files\Tesseract-OCR\" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Setup Python
uses: ./.github/actions/python_cache/
with:
@ -645,6 +653,7 @@ jobs:
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers
TESSDATA_PREFIX: 'C:\Program Files\Tesseract-OCR\tessdata'
# FIXME many tests are disabled here!
run: |
pytest ${{ env.PYTEST_PARAMS }} -m "integration and not tika and not graphdb" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory,faiss,elasticsearch

View File

@ -1,9 +1,10 @@
import logging
import os
import warnings
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union
from more_itertools import divide
@ -29,6 +30,8 @@ class PDFToTextConverter(BaseConverter):
encoding: Optional[str] = None,
keep_physical_layout: Optional[bool] = None,
sort_by_position: bool = False,
ocr: Optional[Literal["auto", "full"]] = None,
ocr_language: str = "eng",
multiprocessing: Union[bool, int] = True,
) -> None:
"""
@ -52,6 +55,11 @@ class PDFToTextConverter(BaseConverter):
:param sort_by_position: Specifies whether to sort the extracted text by positional coordinates or logical reading order.
If set to True, the text is sorted first by vertical position, and then by horizontal position.
If set to False (default), the logical reading order in the PDF is used.
:param ocr: Specifies whether to use OCR to extract text from images in the PDF. If set to "auto", OCR is used only to extract text
from images and integrate it into the existing text. If set to "full", OCR is used to extract text from the entire PDF.
If set to None (the default value), OCR is not used.
:param ocr_language: Specifies the language to use for OCR. The default language is English, whose language code is `eng`.
For a list of supported languages and their respective codes, see https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html.
You can combine multiple languages by passing a string with the language codes separated by `+`. For example, to use English and German, pass `eng+deu`.
:param multiprocessing: We use multiprocessing to speed up PyMuPDF conversion, you can disable it by setting it to False.
If set to True (the default value), the total number of cores is used. To specify the number of cores to use, set it to an integer.
"""
@ -61,6 +69,13 @@ class PDFToTextConverter(BaseConverter):
self.sort_by_position = sort_by_position
self.multiprocessing = multiprocessing
self.ocr = ocr
self.ocr_language = ocr_language
if ocr is not None:
if ocr not in ["auto", "full"]:
raise ValueError("The ocr parameter must be either 'auto' or 'full'.")
self._check_tessdata()
if encoding:
warnings.warn(
@ -83,6 +98,8 @@ class PDFToTextConverter(BaseConverter):
end_page: Optional[int] = None,
keep_physical_layout: Optional[bool] = None,
sort_by_position: Optional[bool] = None,
ocr: Optional[Literal["auto", "full"]] = None,
ocr_language: Optional[str] = None,
multiprocessing: Optional[Union[bool, int]] = None,
) -> List[Document]:
"""
@ -112,6 +129,12 @@ class PDFToTextConverter(BaseConverter):
In this case the id will be generated by using the content and the defined metadata.
:param start_page: The page number where to start the conversion
:param end_page: The page number where to end the conversion.
:param ocr: Specifies whether to use OCR to extract text from images in the PDF. If set to "auto", OCR is used only to extract text
from images and integrate it into the existing text. If set to "full", OCR is used to extract text from the entire PDF.
If set to None (the default value), the value defined in the class initialization is used.
To use this feature you must install Tesseract-OCR. For more information, see https://github.com/tesseract-ocr/tesseract#installing-tesseract.
:param ocr_language: Specifies the language to use for OCR. The default language is English, whose language code is `eng`.
For a list of supported languages and their respective codes, see https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html.
You can combine multiple languages by passing a string with the language codes separated by `+`. For example, to use English and German, pass `eng+deu`.
:param multiprocessing: We use multiprocessing to speed up PyMuPDF conversion, you can disable it by setting it to False.
If set to None (the default value), the value defined in the class initialization is used.
If set to True, the total number of cores is used. To specify the number of cores to use, set it to an integer.
@ -126,6 +149,10 @@ class PDFToTextConverter(BaseConverter):
multiprocessing = self.multiprocessing
if sort_by_position is None:
sort_by_position = self.sort_by_position
if ocr is None:
ocr = self.ocr
if ocr_language is None:
ocr_language = self.ocr_language
if encoding:
warnings.warn(
@ -136,11 +163,18 @@ class PDFToTextConverter(BaseConverter):
if keep_physical_layout:
warnings.warn("The keep_physical_layout parameter is being deprecated.", DeprecationWarning)
if ocr is not None:
if ocr not in ["auto", "full"]:
raise ValueError("The ocr parameter must be either 'auto' or 'full'.")
self._check_tessdata()
pages = self._read_pdf(
file_path,
sort_by_position=sort_by_position,
start_page=start_page,
end_page=end_page,
ocr=ocr,
ocr_language=ocr_language,
multiprocessing=multiprocessing,
)
@ -176,24 +210,47 @@ class PDFToTextConverter(BaseConverter):
document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
return [document]
def _check_tessdata(self):
if os.getenv("TESSDATA_PREFIX") is None:
raise EnvironmentError(
"""
To enable OCR support via PDFToTextConverter, you need to install Tesseract:
- Windows: choco install tesseract-ocr
- Linux (Ubuntu): sudo apt-get install tesseract-ocr
- Mac: brew install tesseract
After that, you need to set the environment variable TESSDATA_PREFIX to the path
of your Tesseract data directory. Typically this is:
- Windows: C:\\Program Files\\Tesseract-OCR\\tessdata
- Linux (Ubuntu): /usr/share/tesseract-ocr/4.00/tessdata
- Mac (Intel): /usr/local/Cellar/tesseract/5.3.0_1/share/tessdata
- Mac (M1/M2): /opt/homebrew/Cellar/tesseract/5.3.0_1/share/tessdata
"""
)
def _get_text_parallel(self, page_mp):
idx, filename, parts, sort_by_position = page_mp
idx, filename, parts, sort_by_position, ocr, ocr_language = page_mp
doc = fitz.open(filename)
text = ""
for i in parts[idx]:
page = doc[i]
text += page.get_text("text", sort=sort_by_position) + "\f"
partial_tp = None
if ocr is not None:
full = ocr == "full"
partial_tp = page.get_textpage_ocr(flags=0, full=full, dpi=300, language=ocr_language)
text += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f"
return text
def _read_pdf(
self,
file_path: Path,
ocr_language: str,
sort_by_position: bool = False,
start_page: Optional[int] = None,
end_page: Optional[int] = None,
ocr: Optional[Literal["auto", "full"]] = None,
multiprocessing: Optional[Union[bool, int]] = None,
) -> List[str]:
"""
@ -227,13 +284,17 @@ class PDFToTextConverter(BaseConverter):
if not multiprocessing:
for i in range(start_page, end_page):
page = doc[i]
document += page.get_text("text", sort=sort_by_position) + "\f"
partial_tp = None
if ocr is not None:
full = ocr == "full"
partial_tp = page.get_textpage_ocr(flags=0, full=full, dpi=300, language=ocr_language)
document += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f"
else:
cpu = cpu_count() if isinstance(multiprocessing, bool) else multiprocessing
page_list = [i for i in range(start_page, end_page)]
cpu = cpu if len(page_list) > cpu else len(page_list)
parts = divide(cpu, page_list)
pages_mp = [(i, file_path, parts, sort_by_position) for i in range(cpu)]
pages_mp = [(i, file_path, parts, sort_by_position, ocr, ocr_language) for i in range(cpu)]
with ProcessPoolExecutor(max_workers=cpu) as pool:
results = pool.map(self._get_text_parallel, pages_mp)

View File

@ -1,5 +1,6 @@
import logging
import tempfile
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional
@ -42,6 +43,15 @@ class PDFToTextOCRConverter(BaseConverter):
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""
warnings.warn(
"""
The PDFToTextOCRConverter node is deprecated and will be removed in future versions.
Please use the PDFToTextConverter node instead and set the parameter ocr and ocr_language.
""",
category=DeprecationWarning,
)
if valid_languages is None:
valid_languages = ["eng"]
# init image to text instance
@ -95,7 +105,7 @@ class PDFToTextOCRConverter(BaseConverter):
pages = []
try:
images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
images = convert_from_path(file_path, first_page=start_page, last_page=end_page) # type: ignore
for image in images:
temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
image.save(temp_img.name)

View File

@ -1,28 +1,29 @@
from typing import List
import os
import sys
from pathlib import Path
import subprocess
import csv
import json
import os
import subprocess
import sys
import warnings
from pathlib import Path
from typing import List
from unittest.mock import patch
import pandas as pd
import pytest
from haystack import Document
from haystack.nodes import (
MarkdownConverter,
AzureConverter,
CsvTextConverter,
DocxToTextConverter,
JsonConverter,
MarkdownConverter,
ParsrConverter,
PDFToTextConverter,
PDFToTextOCRConverter,
TikaConverter,
AzureConverter,
ParsrConverter,
TextConverter,
CsvTextConverter,
JsonConverter,
PreProcessor,
TextConverter,
TikaConverter,
)
from ..conftest import SAMPLES_PATH, fail_at_version
@ -184,6 +185,26 @@ def test_pdf_parallel_sort_by_position(Converter):
assert pages[-1] == "This is the page 50 of the document."
@pytest.mark.integration
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_ocr(Converter):
    """
    Convert a sample PDF with full-page OCR and multiprocessing enabled, then check the text
    of the first and the last page.
    """
    sample_file = SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf"
    ocr_settings = {"multiprocessing": True, "sort_by_position": True, "ocr": "full", "ocr_language": "eng"}
    results = Converter(**ocr_settings).convert(file_path=sample_file)
    # The converted content is split on form feed characters to get one entry per page.
    page_texts = results[0].content.split("\f")
    assert page_texts[0] == "This is the page 1 of the document."
    assert page_texts[-1] == "This is the page 50 of the document."
# ImageToTextConverter.__new__ is replaced by a mock for the duration of the test,
# presumably so that no real image-to-text backend is needed when the node is built — TODO confirm.
@pytest.mark.unit
@fail_at_version(1, 17)
@patch("haystack.nodes.file_converter.image.ImageToTextConverter.__new__")
def test_deprecated_ocr_node(mock):
    """
    Check that instantiating PDFToTextOCRConverter emits a DeprecationWarning.
    """
    # NOTE(review): fail_at_version(1, 17) presumably makes this test fail once that version is
    # reached, as a reminder to remove the deprecated node — confirm in conftest.
    with pytest.warns(DeprecationWarning):
        PDFToTextOCRConverter()
@fail_at_version(1, 17)
def test_deprecated_encoding():
with pytest.warns(DeprecationWarning):