Add ImageToTextConverter and PDFToTextOCRConverter that utilize OCR (#1349)

* add image.py converter

* add PDFtoImageConverter

* add init to PDFtoImageConverter and classes to __init__

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* revert change in base.py in file_conv

* Update base.py

* Update pdf.py

* add ocr file_converter testcase & update dockerfile

* fix tesseract exception message typo

* fix _image_to_text docstring

* add tesseract installation to CI

* add tesseract installation to CI

* add content test for PDF OCR converter

* update PDFToTextOCRConverter constructor docstring

* replace image files with tmp paths for image.py convert

* replace image files with tmp paths for image.py convert

* Update README.md

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
This commit is contained in:
Shahrukh Khan 2021-09-01 16:42:25 +02:00 committed by GitHub
parent 1d2252e96d
commit 4822536886
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 326 additions and 33 deletions

View File

@ -91,5 +91,8 @@ jobs:
- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin
- name: Install tesseract
run: sudo apt-get install tesseract-ocr libtesseract-dev poppler-utils
- name: Run tests
run: cd test && pytest -s ${{ matrix.test-path }}

View File

@ -10,6 +10,9 @@ RUN wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.
RUN apt-get install libpoppler-cpp-dev pkg-config -y --fix-missing
# Install Tesseract
RUN apt-get install tesseract-ocr libtesseract-dev poppler-utils
# copy code
COPY haystack /home/user/haystack

View File

@ -316,16 +316,17 @@ Please also refer to our [documentation](https://haystack.deepset.ai/overview/in
**What**
Different converters to extract text from your original files (pdf, docx, txt, html).
While it's almost impossible to cover all types, layouts, and special cases (especially in PDFs), we cover the most common formats (incl. multi-column) and extract meta-information (e.g., page splits). The converters are easily extendable so that you can customize them for your files if needed.
Different converters to extract text from your original files (pdf, docx, txt, md, html).
While it's almost impossible to cover all types, layouts, and special cases (especially in PDFs), we cover the most common formats (incl. multi-column) and extract meta-information (e.g., page splits). The converters are easily extendable so that you can customize them for your files if needed. We also provide an OCR-based approach for converting images or PDFs.
**Available options**
- Txt
- PDF
- PDF (incl. OCR)
- Docx
- Apache Tika (Supports > 340 file formats)
- Markdown
- Images
**Example**

View File

@ -4,3 +4,5 @@ from haystack.file_converter.markdown import MarkdownConverter
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.tika import TikaConverter
from haystack.file_converter.txt import TextConverter
from haystack.file_converter.image import ImageToTextConverter
from haystack.file_converter.pdf import PDFToTextOCRConverter

View File

@ -14,7 +14,11 @@ class BaseConverter(BaseComponent):
outgoing_edges = 1
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
def __init__(
self,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -29,7 +33,9 @@ class BaseConverter(BaseComponent):
"""
# save init parameters to enable export of component config as YAML
self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
self.set_config(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
self.remove_numeric_tables = remove_numeric_tables
self.valid_languages = valid_languages
@ -95,12 +101,14 @@ class BaseConverter(BaseComponent):
documents: list = []
for file_path, file_meta in zip(file_paths, meta):
documents.append(self.convert(
documents.append(
self.convert(
file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
))
)
)
result = {"documents": documents, **kwargs}
return result, "output_1"
@ -110,6 +118,7 @@ class FileTypeClassifier(BaseComponent):
"""
Route files in an Indexing Pipeline to corresponding file converters.
"""
outgoing_edges = 5
def _get_files_extension(self, file_paths: list) -> set:
@ -118,11 +127,11 @@ class FileTypeClassifier(BaseComponent):
:param file_paths:
:return: set
"""
return {file_path.suffix.lstrip('.') for file_path in file_paths}
return {file_path.suffix.lstrip(".") for file_path in file_paths}
def run(self, file_paths: Union[Path, List[Path]], **kwargs): # type: ignore
"""
Return the output based on file extension
Return the output based on file extension
"""
if isinstance(file_paths, Path):
file_paths = [file_paths]

View File

@ -0,0 +1,152 @@
import logging
import subprocess
from pathlib import Path
from typing import List, Optional, Dict, Any
import pytesseract
from PIL.PpmImagePlugin import PpmImageFile
from PIL import Image
from haystack.file_converter.base import BaseConverter
logger = logging.getLogger(__name__)
class ImageToTextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        Extract text from image files using the pytesseract OCR library.

        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified here
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                Defaults to ["eng"] when None is passed.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text. Run the following line of code to check available language packs:
                                # List of available languages
                                print(pytesseract.get_languages(config=''))
        """
        # Avoid a mutable default argument: None stands in for the previous default of ["eng"].
        if valid_languages is None:
            valid_languages = ["eng"]

        # save init parameters to enable export of component config as YAML
        self.set_config(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )

        # With shell=True the command must be a single string, not a list;
        # the shell returns 127 when the tesseract binary is not on PATH.
        verify_installation = subprocess.run("tesseract -v", shell=True)
        if verify_installation.returncode == 127:
            raise Exception(
                """tesseract is not installed.

                   Installation on Linux:
                   apt-get install tesseract-ocr libtesseract-dev poppler-utils

                   Installation on MacOS:
                   brew install tesseract

                   For installing specific language packs check here: https://tesseract-ocr.github.io/tessdoc/Installation.html
                """
            )

        # Query the installed language packs once; each call shells out to tesseract,
        # so keeping it outside the loop avoids one subprocess per requested language.
        installed_langs = pytesseract.get_languages(config="")
        tesseract_langs = []
        for language in valid_languages:
            if language in installed_langs and language not in tesseract_langs:
                tesseract_langs.append(language)
            else:
                raise Exception(
                    f"""{language} is not either a valid tesseract language code or its language pack isn't installed.

                    Check the list of valid tesseract language codes here: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html

                    For installing specific language packs check here: https://tesseract-ocr.github.io/tessdoc/Installation.html
                    """
                )

        ## if you have more than one language in images, then pass it to tesseract like this e.g., `fra+eng`
        self.tesseract_langs = "+".join(tesseract_langs)
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
    ) -> Dict[str, Any]:
        """
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

        :param file_path: path to image file
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages supported by tesseract
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: kept for interface compatibility with other converters; not used by OCR extraction.
        """
        image = Image.open(file_path)
        pages = self._image_to_text(image)
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages

        cleaned_pages = []
        for page in pages:
            cleaned_lines = []
            for line in page.splitlines():
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period(.)
                if remove_numeric_tables:
                    if (
                        words
                        and len(digits) / len(words) > 0.4
                        and not line.strip().endswith(".")
                    ):
                        logger.debug(f"Removing line '{line}' from file")
                        continue
                cleaned_lines.append(line)

            cleaned_pages.append("\n".join(cleaned_lines))

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text):
                logger.warning(
                    f"The language for image is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        # Pages are joined by form feed, matching the page separator used by the PDF converters.
        text = "\f".join(cleaned_pages)
        document = {"text": text, "meta": meta}
        return document

    def _image_to_text(self, image: PpmImageFile) -> List[str]:
        """
        Extract text from image file.

        :param image: input image file
        :return: a single-element list with the OCR text of the image
        """
        text = [pytesseract.image_to_string(image, lang=self.tesseract_langs)]
        return text

View File

@ -1,15 +1,25 @@
import logging
import subprocess
from pathlib import Path
import tempfile
import os
from typing import List, Optional, Dict, Any
from pdf2image import convert_from_path, convert_from_bytes
from haystack.file_converter.base import BaseConverter
from haystack.file_converter.image import ImageToTextConverter
logger = logging.getLogger(__name__)
class PDFToTextConverter(BaseConverter):
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
def __init__(
self,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -22,9 +32,10 @@ class PDFToTextConverter(BaseConverter):
not one of the valid languages, then it might likely be encoding error resulting
in garbled text.
"""
# save init parameters to enable export of component config as YAML
self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
self.set_config(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
verify_installation = subprocess.run(["pdftotext -v"], shell=True)
if verify_installation.returncode == 127:
@ -42,7 +53,9 @@ class PDFToTextConverter(BaseConverter):
"""
)
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
def convert(
self,
@ -102,7 +115,11 @@ class PDFToTextConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
if (
words
and len(digits) / len(words) > 0.4
and not line.strip().endswith(".")
):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
cleaned_lines.append(line)
@ -122,7 +139,9 @@ class PDFToTextConverter(BaseConverter):
document = {"text": text, "meta": meta}
return document
def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1") -> List[str]:
def _read_pdf(
self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1"
) -> List[str]:
"""
Extract pages from the pdf file at file_path.
@ -134,9 +153,85 @@ class PDFToTextConverter(BaseConverter):
command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
else:
command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
output = subprocess.run(command, stdout=subprocess.PIPE, shell=False) # type: ignore
output = subprocess.run(command, stdout=subprocess.PIPE, shell=False) # type: ignore
document = output.stdout.decode(errors="ignore")
pages = document.split("\f")
pages = pages[:-1] # the last page in the split is always empty.
return pages
class PDFToTextOCRConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages supported by tesseract
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                Defaults to ["eng"] when None is passed.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        """
        # Avoid a mutable default argument: None stands in for the previous default of ["eng"].
        if valid_languages is None:
            valid_languages = ["eng"]

        # init image to text instance; it performs the actual OCR per page
        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)

        # save init parameters to enable export of component config as YAML
        self.set_config(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
        )

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
    ) -> Dict[str, Any]:
        """
        Convert a file to a dictionary containing the text and any associated meta data.

        File converters may extract file meta like name or size. In addition to it, user
        supplied meta data like author, url, external IDs can be supplied as a dictionary.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: Select the file encoding (default is `utf-8`)
        """
        pages = []
        try:
            # Render each PDF page to an image, then OCR it via the image converter.
            images = convert_from_path(file_path)
            for image in images:
                # Context manager guarantees the temporary JPEG (and its file
                # descriptor) is removed even if OCR raises for this page.
                # NOTE(review): the temp file is placed next to this module on
                # purpose (see commit history) — confirm before changing dir=.
                with tempfile.NamedTemporaryFile(
                    dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg"
                ) as temp_img:
                    image.save(temp_img.name)
                    pages.append(self.image_2_text.convert(temp_img.name)["text"])
        except Exception as exception:
            # Best-effort conversion: log and return whatever pages succeeded.
            logger.error(f"File {file_path} has an error \n {exception}")

        # Pages are joined by form feed, matching the page separator used by PDFToTextConverter.
        raw_text = "\f".join(pages)
        document = {"text": raw_text, "meta": meta}
        return document

View File

@ -21,7 +21,9 @@ class TikaXHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
# find page div
pagediv = [value for attr, value in attrs if attr == "class" and value == "page"]
pagediv = [
value for attr, value in attrs if attr == "class" and value == "page"
]
if tag == "div" and pagediv:
self.ingest = True
@ -43,7 +45,7 @@ class TikaConverter(BaseConverter):
self,
tika_url: str = "http://localhost:9998/tika",
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None
valid_languages: Optional[List[str]] = None,
):
"""
:param tika_url: URL of the Tika server
@ -61,15 +63,21 @@ class TikaConverter(BaseConverter):
# save init parameters to enable export of component config as YAML
self.set_config(
tika_url=tika_url, remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
tika_url=tika_url,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
)
ping = requests.get(tika_url)
if ping.status_code != 200:
raise Exception(f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'")
raise Exception(
f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'"
)
self.tika_url = tika_url
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
)
def convert(
self,
@ -101,7 +109,9 @@ class TikaConverter(BaseConverter):
if valid_languages is None:
valid_languages = self.valid_languages
parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
parsed = tikaparser.from_file(
file_path.as_posix(), self.tika_url, xmlContent=True
)
parser = TikaXHTMLParser()
parser.feed(parsed["content"])
@ -116,7 +126,11 @@ class TikaConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
if (
words
and len(digits) / len(words) > 0.4
and not line.strip().endswith(".")
):
logger.debug(f"Removing line '{line}' from {file_path}")
continue

View File

@ -11,6 +11,10 @@ elastic-apm
tox
coverage
langdetect # for PDF conversions
# for PDF conversions using OCR
pytesseract==0.3.7
pillow==8.2.0
pdf2image==1.14.0
# optional: sentence-transformers
python-multipart
python-docx

View File

@ -4,12 +4,14 @@ import pytest
from haystack.file_converter import MarkdownConverter
from haystack.file_converter.docx import DocxToTextConverter
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
from haystack.file_converter.tika import TikaConverter
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
@pytest.mark.parametrize(
"Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
)
def test_convert(Converter, xpdf_fixture):
converter = Converter()
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
@ -17,6 +19,13 @@ def test_convert(Converter, xpdf_fixture):
assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != "" # the page 1 of PDF contains text.
assert pages[2] == "" # the page 3 of PDF file is empty.
# assert text is retained from the document.
# As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
page_standard_whitespace = " ".join(pages[0].split())
assert (
"Adobe Systems made the PDF specification available free of charge in 1993."
in page_standard_whitespace
)
@pytest.mark.tika
@ -29,22 +38,23 @@ def test_table_removal(Converter, xpdf_fixture):
assert "324" not in pages[0]
assert "54x growth" not in pages[0]
# assert text is retained from the document.
# As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
page_standard_whitespace = " ".join(pages[0].split())
assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, xpdf_fixture, caplog):
converter = Converter(valid_languages=["en"])
converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
assert (
"The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']."
not in caplog.text
)
converter = Converter(valid_languages=["de"])
converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
assert (
"The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']."
in caplog.text
)
def test_docx_converter():