Christian Clauss bf6d306d68
ci: Simplify Python code with ruff rules SIM (#5833)
* ci: Simplify Python code with ruff rules SIM

* Revert #5828

* ruff --select=I --fix haystack/modeling/infer.py

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
2023-09-20 08:32:44 +02:00

171 lines
8.4 KiB
Python

from typing import List, Optional, Dict, Union
import logging
import subprocess
from pathlib import Path
from haystack.nodes.file_converter.base import BaseConverter
from haystack.schema import Document
from haystack.lazy_imports import LazyImport
with LazyImport("Run 'pip install farm-haystack[ocr]'") as ocr_imports:
import pytesseract
from PIL.PpmImagePlugin import PpmImageFile
from PIL import Image
logger = logging.getLogger(__name__)
class ImageToTextConverter(BaseConverter):
    """
    Converter that extracts the text of an image file with Tesseract OCR (through pytesseract)
    and returns it as a single `Document`.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified here
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text. If no value is provided, English will be set as default.
                                Run the following line of code to check available language packs:
                                `# List of available languages
                                print(pytesseract.get_languages(config=''))`
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :raises Exception: if the `tesseract` executable cannot be run, or if one of `valid_languages` is not
            reported by `pytesseract.get_languages`.
        """
        ocr_imports.check()

        if valid_languages is None:
            valid_languages = ["eng"]

        # NOTE: the base initializer is called exactly once and always receives `id_hash_keys`.
        # A later call without it would presumably overwrite the stored value with the default
        # (BaseConverter is defined elsewhere), and `convert` falls back to `self.id_hash_keys`.
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
        )

        # Run the binary directly instead of through a shell: a missing executable then raises
        # FileNotFoundError on every platform, whereas exit status 127 is specific to POSIX shells.
        # The 127 check is kept for wrapper scripts that forward a shell's "command not found" status.
        try:
            verify_installation = subprocess.run(["tesseract", "-v"], check=False)
            tesseract_missing = verify_installation.returncode == 127
        except FileNotFoundError:
            tesseract_missing = True

        if tesseract_missing:
            raise Exception(
                "tesseract is not installed.\n"
                "Installation on Linux:\n"
                "apt-get install tesseract-ocr libtesseract-dev poppler-utils\n"
                "Installation on MacOS:\n"
                "brew install tesseract\n"
                "For installing specific language packs check here: "
                "https://tesseract-ocr.github.io/tessdoc/Installation.html\n"
            )

        tesseract_langs: List[str] = []
        if valid_languages:
            # Query the installed language packs once; the answer does not change between iterations.
            installed_languages = set(pytesseract.get_languages(config=""))
            for language in valid_languages:
                if language not in installed_languages:
                    raise Exception(
                        f"{language} is not either a valid tesseract language code or its language pack "
                        "isn't installed.\n"
                        "Check the list of valid tesseract language codes here: "
                        "https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html\n"
                        "For installing specific language packs check here: "
                        "https://tesseract-ocr.github.io/tessdoc/Installation.html\n"
                    )
                # A language listed twice is valid; it is simply used once.
                if language not in tesseract_langs:
                    tesseract_langs.append(language)

        # If you have more than one language in images, tesseract expects them joined like `fra+eng`.
        self.tesseract_langs = "+".join(tesseract_langs)

    def convert(
        self,
        file_path: Union[Path, str],
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

        :param file_path: path to image file
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
                                      If None, the value given to the constructor is used.
        :param valid_languages: validate languages from a list of languages supported by tesseract
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text. If None, the value given to the constructor is used.
        :param encoding: Not applicable
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
            If None, the value stored on the converter is used.
        :return: a list holding one `Document` whose content is the extracted text, pages joined by a form feed.
        """
        # Per-call arguments win; otherwise fall back to the values stored on the instance.
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages

        file_path = Path(file_path)
        # The context manager releases the underlying file handle as soon as OCR is done.
        with Image.open(file_path) as image:
            pages = self._image_to_text(image)

        cleaned_pages = []
        for page in pages:
            cleaned_lines = []
            for line in page.splitlines():
                if remove_numeric_tables:
                    words = line.split()
                    digits = [word for word in words if any(char.isdigit() for char in word)]
                    # remove lines having > 40% of words as digits AND not ending with a period(.)
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug("Removing line '%s' from file", line)
                        continue
                cleaned_lines.append(line)
            cleaned_pages.append("\n".join(cleaned_lines))

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
                    "The language for image is not one of %s. The file may not have "
                    "been decoded in the correct text format.",
                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]

    def _image_to_text(self, image: "PpmImageFile") -> List[str]:
        """
        Extract text from image file.

        :param image: input image file
        :return: a one-element list with the text recognized in the image.
        """
        # An empty language string (converter created with `valid_languages=[]`) is passed as None;
        # pytesseract then presumably leaves the language option out so tesseract uses its default.
        return [pytesseract.image_to_string(image, lang=self.tesseract_langs or None)]