mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-26 22:48:29 +00:00
feat: add automatic OCR detection mechanism and improve performance (#4329)
* feat: add automatic OCR detection mechanism and improve performance * refactor: add error message * refactor: ignore pdftoppm bad typing * refactor: add Tesseract install. docstrings * fix: check if OCR var. assigned on mp * tests: add path to windows/linux tests * tests: add tessdata path * tests: include matrix ref. * tests: custom Tesseract matrix install * refactor: improve user guide * tests: fix macos path * tests: remove brew formulae version * fix: macos paths * tests: fix macos path * tests: add Tesseract to Windows Path * tests: pytesseract path * tests: macos path * refactor: fix path message and remove extra path from tests * refactor: raise exception when path not found * refactor: expression simplification Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * refactor: check ocr parameter * tests: mark as integration * tests: mock deprecation warning * refactor: simplify code Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * refactor: change deprecation test Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * refactor: add unit patch * refactor: black formatting --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
This commit is contained in:
parent
fd3f3143d4
commit
28724e2e25
9
.github/workflows/tests.yml
vendored
9
.github/workflows/tests.yml
vendored
@ -466,6 +466,7 @@ jobs:
|
||||
- name: Run tests
|
||||
env:
|
||||
TOKENIZERS_PARALLELISM: 'false'
|
||||
TESSDATA_PREFIX: '/usr/share/tesseract-ocr/4.00/tessdata'
|
||||
# as confusing as it seems, we skip tests marked as unit here as it means they have been migrated to the right job already
|
||||
run: |
|
||||
pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration and not unit" test/${{ matrix.folder }} --document_store_type=memory
|
||||
@ -506,6 +507,7 @@ jobs:
|
||||
- name: Run tests
|
||||
env:
|
||||
TOKENIZERS_PARALLELISM: 'false'
|
||||
TESSDATA_PREFIX: 'C:\Program Files\Tesseract-OCR\tessdata'
|
||||
# as confusing as it seems, we skip tests marked as unit here as it means they have been migrated to the right job already
|
||||
run: |
|
||||
pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration and not unit" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory
|
||||
@ -592,6 +594,7 @@ jobs:
|
||||
- name: Run tests
|
||||
env:
|
||||
TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers
|
||||
TESSDATA_PREFIX: '/usr/share/tesseract-ocr/4.00/tessdata'
|
||||
# we add "and not document_store" to exclude the tests that were ported to the new strategy
|
||||
run: |
|
||||
pytest ${{ env.PYTEST_PARAMS }} -m "integration and not document_store" test/${{ matrix.folder }}
|
||||
@ -634,6 +637,11 @@ jobs:
|
||||
refreshenv
|
||||
Get-Service elasticsearch-service-x64 | Start-Service
|
||||
|
||||
- name: Add Tesseract to system PATH
|
||||
shell: pwsh
|
||||
run: |
|
||||
echo "C:\Program Files\Tesseract-OCR\" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
|
||||
- name: Setup Python
|
||||
uses: ./.github/actions/python_cache/
|
||||
with:
|
||||
@ -645,6 +653,7 @@ jobs:
|
||||
- name: Run tests
|
||||
env:
|
||||
TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers
|
||||
TESSDATA_PREFIX: 'C:\Program Files\Tesseract-OCR\tessdata'
|
||||
# FIXME many tests are disabled here!
|
||||
run: |
|
||||
pytest ${{ env.PYTEST_PARAMS }} -m "integration and not tika and not graphdb" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory,faiss,elasticsearch
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
import logging
|
||||
import os
|
||||
import warnings
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from multiprocessing import cpu_count
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, List, Literal, Optional, Union
|
||||
|
||||
from more_itertools import divide
|
||||
|
||||
@ -29,6 +30,8 @@ class PDFToTextConverter(BaseConverter):
|
||||
encoding: Optional[str] = None,
|
||||
keep_physical_layout: Optional[bool] = None,
|
||||
sort_by_position: bool = False,
|
||||
ocr: Optional[Literal["auto", "full"]] = None,
|
||||
ocr_language: str = "eng",
|
||||
multiprocessing: Union[bool, int] = True,
|
||||
) -> None:
|
||||
"""
|
||||
@ -52,6 +55,11 @@ class PDFToTextConverter(BaseConverter):
|
||||
:param sort_by_position: Specifies whether to sort the extracted text by positional coordinates or logical reading order.
|
||||
If set to True, the text is sorted first by vertical position, and then by horizontal position.
|
||||
If set to False (default), the logical reading order in the PDF is used.
|
||||
:param ocr: Specifies whether to use OCR to extract text from images in the PDF. If set to "auto", OCR is used only to extract text
|
||||
from images and integrate into the existing text. If set to "full", OCR is used to extract text from the entire PDF.
|
||||
:param ocr_language: Specifies the language to use for OCR. The default language is English, whose language code is `eng`.
|
||||
For a list of supported languages and the respective codes access https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html.
|
||||
You can combine multiple languages by passing a string with the language codes separated by `+`. For example, to use English and German, pass `eng+deu`.
|
||||
:param multiprocessing: We use multiprocessing to speed up PyMuPDF conversion, you can disable it by setting it to False.
|
||||
If set to True (the default value), the total number of cores is used. To specify the number of cores to use, set it to an integer.
|
||||
"""
|
||||
@ -61,6 +69,13 @@ class PDFToTextConverter(BaseConverter):
|
||||
|
||||
self.sort_by_position = sort_by_position
|
||||
self.multiprocessing = multiprocessing
|
||||
self.ocr = ocr
|
||||
self.ocr_language = ocr_language
|
||||
|
||||
if ocr is not None:
|
||||
if ocr not in ["auto", "full"]:
|
||||
raise ValueError("The ocr parameter must be either 'auto' or 'full'.")
|
||||
self._check_tessdata()
|
||||
|
||||
if encoding:
|
||||
warnings.warn(
|
||||
@ -83,6 +98,8 @@ class PDFToTextConverter(BaseConverter):
|
||||
end_page: Optional[int] = None,
|
||||
keep_physical_layout: Optional[bool] = None,
|
||||
sort_by_position: Optional[bool] = None,
|
||||
ocr: Optional[Literal["auto", "full"]] = None,
|
||||
ocr_language: Optional[str] = None,
|
||||
multiprocessing: Optional[Union[bool, int]] = None,
|
||||
) -> List[Document]:
|
||||
"""
|
||||
@ -112,6 +129,12 @@ class PDFToTextConverter(BaseConverter):
|
||||
In this case the id will be generated by using the content and the defined metadata.
|
||||
:param start_page: The page number where to start the conversion
|
||||
:param end_page: The page number where to end the conversion.
|
||||
:param ocr: Specifies whether to use OCR to extract text from images in the PDF. If set to "auto", OCR is used only to extract text
|
||||
from images and integrate into the existing text. If set to "full", OCR is used to extract text from the entire PDF.
|
||||
To use this feature you must install Tesseract-OCR. For more information, see https://github.com/tesseract-ocr/tesseract#installing-tesseract.
|
||||
:param ocr_language: Specifies the language to use for OCR. The default language is English, whose language code is `eng`.
|
||||
For a list of supported languages and the respective codes access https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html.
|
||||
You can combine multiple languages by passing a string with the language codes separated by `+`. For example, to use English and German, pass `eng+deu`.
|
||||
:param multiprocessing: We use multiprocessing to speed up PyMuPDF conversion, you can disable it by setting it to False.
|
||||
If set to None (the default value), the value defined in the class initialization is used.
|
||||
If set to True, the total number of cores is used. To specify the number of cores to use, set it to an integer.
|
||||
@ -126,6 +149,10 @@ class PDFToTextConverter(BaseConverter):
|
||||
multiprocessing = self.multiprocessing
|
||||
if sort_by_position is None:
|
||||
sort_by_position = self.sort_by_position
|
||||
if ocr is None:
|
||||
ocr = self.ocr
|
||||
if ocr_language is None:
|
||||
ocr_language = self.ocr_language
|
||||
|
||||
if encoding:
|
||||
warnings.warn(
|
||||
@ -136,11 +163,18 @@ class PDFToTextConverter(BaseConverter):
|
||||
if keep_physical_layout:
|
||||
warnings.warn("The keep_physical_layout parameter is being deprecated.", DeprecationWarning)
|
||||
|
||||
if ocr is not None:
|
||||
if ocr not in ["auto", "full"]:
|
||||
raise ValueError("The ocr parameter must be either 'auto' or 'full'.")
|
||||
self._check_tessdata()
|
||||
|
||||
pages = self._read_pdf(
|
||||
file_path,
|
||||
sort_by_position=sort_by_position,
|
||||
start_page=start_page,
|
||||
end_page=end_page,
|
||||
ocr=ocr,
|
||||
ocr_language=ocr_language,
|
||||
multiprocessing=multiprocessing,
|
||||
)
|
||||
|
||||
@ -176,24 +210,47 @@ class PDFToTextConverter(BaseConverter):
|
||||
document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
|
||||
return [document]
|
||||
|
||||
def _check_tessdata(self):
    """
    Verify that the Tesseract data directory has been configured for OCR.

    Only the TESSDATA_PREFIX environment variable is inspected; the directory it names is not
    opened or validated here.

    :raises EnvironmentError: If TESSDATA_PREFIX is unset, empty, or contains only whitespace.
    """
    # A blank value is as unusable as a missing one, so both are rejected with the same guidance.
    tessdata_prefix = os.getenv("TESSDATA_PREFIX")
    if tessdata_prefix is None or not tessdata_prefix.strip():
        raise EnvironmentError(
            """
            To enable OCR support via PDFToTextConverter, you need to install Tesseract:
            - Windows: choco install tesseract-ocr
            - Linux (Ubuntu): sudo apt-get install tesseract-ocr
            - Mac: brew install tesseract
            After that, you need to set the environment variable TESSDATA_PREFIX to the path
            of your Tesseract data directory. Typically this is:
            - Windows: C:\\Program Files\\Tesseract-OCR\\tessdata
            - Linux (Ubuntu): /usr/share/tesseract-ocr/4.00/tessdata
            - Mac (Intel): /usr/local/Cellar/tesseract/5.3.0_1/share/tessdata
            - Mac (M1/M2): /opt/homebrew/Cellar/tesseract/5.3.0_1/share/tessdata
            """
        )
|
||||
|
||||
def _get_text_parallel(self, page_mp):
    """
    Extract the text of one chunk of pages. This is the callable mapped over the process pool.

    :param page_mp: Tuple of `(idx, filename, parts, sort_by_position, ocr, ocr_language)`.
                    `parts[idx]` yields the page indices this call is responsible for.
                    `ocr` is None (no OCR), "auto" or "full"; `ocr_language` is forwarded to
                    the OCR call as its `language` argument.
    :return: The text of the handled pages concatenated in order, each followed by a form feed.
    """
    idx, filename, parts, sort_by_position, ocr, ocr_language = page_mp

    page_texts = []
    # Open the document as a context manager so the file handle is released when the chunk is done.
    with fitz.open(filename) as doc:
        for i in parts[idx]:
            page = doc[i]
            # Without OCR the text page stays None and get_text falls back to plain extraction.
            partial_tp = None
            if ocr is not None:
                partial_tp = page.get_textpage_ocr(
                    flags=0, full=(ocr == "full"), dpi=300, language=ocr_language
                )
            # Exactly one entry per page: the OCR-aware call replaces the plain one, it does not add to it.
            page_texts.append(page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f")

    return "".join(page_texts)
|
||||
|
||||
def _read_pdf(
|
||||
self,
|
||||
file_path: Path,
|
||||
ocr_language: str,
|
||||
sort_by_position: bool = False,
|
||||
start_page: Optional[int] = None,
|
||||
end_page: Optional[int] = None,
|
||||
ocr: Optional[Literal["auto", "full"]] = None,
|
||||
multiprocessing: Optional[Union[bool, int]] = None,
|
||||
) -> List[str]:
|
||||
"""
|
||||
@ -227,13 +284,17 @@ class PDFToTextConverter(BaseConverter):
|
||||
if not multiprocessing:
|
||||
for i in range(start_page, end_page):
|
||||
page = doc[i]
|
||||
document += page.get_text("text", sort=sort_by_position) + "\f"
|
||||
partial_tp = None
|
||||
if ocr is not None:
|
||||
full = ocr == "full"
|
||||
partial_tp = page.get_textpage_ocr(flags=0, full=full, dpi=300, language=ocr_language)
|
||||
document += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f"
|
||||
else:
|
||||
cpu = cpu_count() if isinstance(multiprocessing, bool) else multiprocessing
|
||||
page_list = [i for i in range(start_page, end_page)]
|
||||
cpu = cpu if len(page_list) > cpu else len(page_list)
|
||||
parts = divide(cpu, page_list)
|
||||
pages_mp = [(i, file_path, parts, sort_by_position) for i in range(cpu)]
|
||||
pages_mp = [(i, file_path, parts, sort_by_position, ocr, ocr_language) for i in range(cpu)]
|
||||
|
||||
with ProcessPoolExecutor(max_workers=cpu) as pool:
|
||||
results = pool.map(self._get_text_parallel, pages_mp)
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import tempfile
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
@ -42,6 +43,15 @@ class PDFToTextOCRConverter(BaseConverter):
|
||||
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
|
||||
In this case the id will be generated by using the content and the defined metadata.
|
||||
"""
|
||||
|
||||
warnings.warn(
|
||||
"""
|
||||
The PDFToTextOCRConverter node is deprecated and will be removed in future versions.
|
||||
Please use the PDFToTextConverter node instead and set the parameter ocr and ocr_language.
|
||||
""",
|
||||
category=DeprecationWarning,
|
||||
)
|
||||
|
||||
if valid_languages is None:
|
||||
valid_languages = ["eng"]
|
||||
# init image to text instance
|
||||
@ -95,7 +105,7 @@ class PDFToTextOCRConverter(BaseConverter):
|
||||
|
||||
pages = []
|
||||
try:
|
||||
images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
|
||||
images = convert_from_path(file_path, first_page=start_page, last_page=end_page) # type: ignore
|
||||
for image in images:
|
||||
temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
|
||||
image.save(temp_img.name)
|
||||
|
||||
@ -1,28 +1,29 @@
|
||||
from typing import List
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from unittest.mock import patch
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from haystack import Document
|
||||
from haystack.nodes import (
|
||||
MarkdownConverter,
|
||||
AzureConverter,
|
||||
CsvTextConverter,
|
||||
DocxToTextConverter,
|
||||
JsonConverter,
|
||||
MarkdownConverter,
|
||||
ParsrConverter,
|
||||
PDFToTextConverter,
|
||||
PDFToTextOCRConverter,
|
||||
TikaConverter,
|
||||
AzureConverter,
|
||||
ParsrConverter,
|
||||
TextConverter,
|
||||
CsvTextConverter,
|
||||
JsonConverter,
|
||||
PreProcessor,
|
||||
TextConverter,
|
||||
TikaConverter,
|
||||
)
|
||||
|
||||
from ..conftest import SAMPLES_PATH, fail_at_version
|
||||
@ -184,6 +185,26 @@ def test_pdf_parallel_sort_by_position(Converter):
|
||||
assert pages[-1] == "This is the page 50 of the document."
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_parallel_ocr(Converter):
    """Convert the sample PDF with full OCR and multiprocessing, then check the first and last page text."""
    sample_pdf = SAMPLES_PATH / "pdf" / "sample_pdf_6.pdf"
    ocr_converter = Converter(multiprocessing=True, sort_by_position=True, ocr="full", ocr_language="eng")

    documents = ocr_converter.convert(file_path=sample_pdf)
    # Pages are delimited by form feeds in the converted content.
    pages = documents[0].content.split("\f")

    first_page = pages[0]
    assert first_page == "This is the page 1 of the document."
    last_page = pages[-1]
    assert last_page == "This is the page 50 of the document."
|
||||
|
||||
|
||||
@pytest.mark.unit
@fail_at_version(1, 17)
@patch("haystack.nodes.file_converter.image.ImageToTextConverter.__new__")
def test_deprecated_ocr_node(mock):
    """Instantiating PDFToTextOCRConverter must emit a DeprecationWarning."""
    # ImageToTextConverter.__new__ is patched by the decorator above; `mock` receives that patch object.
    expect_deprecation = pytest.warns(DeprecationWarning)
    with expect_deprecation:
        PDFToTextOCRConverter()
|
||||
|
||||
|
||||
@fail_at_version(1, 17)
|
||||
def test_deprecated_encoding():
|
||||
with pytest.warns(DeprecationWarning):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user