diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8092b48da..f1725916f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -91,5 +91,8 @@ jobs:
     - name: Install pdftotext
       run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin
+    - name: Install tesseract
+      run: sudo apt-get install tesseract-ocr libtesseract-dev poppler-utils
+
     - name: Run tests
       run: cd test && pytest -s ${{ matrix.test-path }}
diff --git a/Dockerfile b/Dockerfile
index 5091a534e..36525019f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,6 +10,9 @@ RUN wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.
 
 RUN apt-get install libpoppler-cpp-dev pkg-config -y --fix-missing
 
+# Install Tesseract
+RUN apt-get install tesseract-ocr libtesseract-dev poppler-utils
+
 # copy code
 COPY haystack /home/user/haystack
diff --git a/README.md b/README.md
index e01913975..a53fc8559 100644
--- a/README.md
+++ b/README.md
@@ -316,16 +316,17 @@ Please also refer to our [documentation](https://haystack.deepset.ai/overview/in
 
 **What**
 
-Different converters to extract text from your original files (pdf, docx, txt, html).
-While it's almost impossible to cover all types, layouts, and special cases (especially in PDFs), we cover the most common formats (incl. multi-column) and extract meta-information (e.g., page splits). The converters are easily extendable so that you can customize them for your files if needed.
+Different converters to extract text from your original files (pdf, docx, txt, md, html).
+While it's almost impossible to cover all types, layouts, and special cases (especially in PDFs), we cover the most common formats (incl. multi-column) and extract meta-information (e.g., page splits). The converters are easily extendable so that you can customize them for your files if needed. We also provide an OCR-based approach for converting images or PDFs.
 
 **Available options**
 
 - Txt
-- PDF
+- PDF (incl. OCR)
 - Docx
 - Apache Tika (Supports > 340 file formats)
 - Markdown
+- Images
 
 **Example**
 
diff --git a/haystack/file_converter/__init__.py b/haystack/file_converter/__init__.py
index 3f059381d..43511e80f 100644
--- a/haystack/file_converter/__init__.py
+++ b/haystack/file_converter/__init__.py
@@ -4,3 +4,5 @@ from haystack.file_converter.markdown import MarkdownConverter
 from haystack.file_converter.pdf import PDFToTextConverter
 from haystack.file_converter.tika import TikaConverter
 from haystack.file_converter.txt import TextConverter
+from haystack.file_converter.image import ImageToTextConverter
+from haystack.file_converter.pdf import PDFToTextOCRConverter
diff --git a/haystack/file_converter/base.py b/haystack/file_converter/base.py
index ee2fb2961..b37b896d2 100644
--- a/haystack/file_converter/base.py
+++ b/haystack/file_converter/base.py
@@ -14,7 +14,11 @@ class BaseConverter(BaseComponent):
 
     outgoing_edges = 1
 
-    def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+    ):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                       The tabular structures in documents might be noise for the reader model if it
@@ -29,7 +33,9 @@ class BaseConverter(BaseComponent):
         """
         # save init parameters to enable export of component config as YAML
-        self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
+        self.set_config(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
 
         self.remove_numeric_tables = remove_numeric_tables
         self.valid_languages = valid_languages
@@ -95,12 +101,14 @@ class BaseConverter(BaseComponent):
 
         documents: list = []
         for file_path, file_meta in zip(file_paths, meta):
-            documents.append(self.convert(
+            documents.append(
+                self.convert(
                     file_path=file_path,
                     meta=file_meta,
                     remove_numeric_tables=remove_numeric_tables,
                     valid_languages=valid_languages,
-            ))
+                )
+            )
 
         result = {"documents": documents, **kwargs}
         return result, "output_1"
@@ -110,6 +118,7 @@ class FileTypeClassifier(BaseComponent):
     """
     Route files in an Indexing Pipeline to corresponding file converters.
     """
+
     outgoing_edges = 5
 
     def _get_files_extension(self, file_paths: list) -> set:
@@ -118,11 +127,11 @@
         :param file_paths:
         :return: set
         """
-        return {file_path.suffix.lstrip('.') for file_path in file_paths}
+        return {file_path.suffix.lstrip(".") for file_path in file_paths}
 
     def run(self, file_paths: Union[Path, List[Path]], **kwargs):  # type: ignore
         """
-        Return the output based on file extension
+        Return the output based on file extension
         """
         if isinstance(file_paths, Path):
             file_paths = [file_paths]
diff --git a/haystack/file_converter/image.py b/haystack/file_converter/image.py
new file mode 100644
index 000000000..43f2dff83
--- /dev/null
+++ b/haystack/file_converter/image.py
@@ -0,0 +1,152 @@
+import logging
+import subprocess
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+
+import pytesseract
+from PIL.PpmImagePlugin import PpmImageFile
+from PIL import Image
+
+from haystack.file_converter.base import BaseConverter
+
+logger = logging.getLogger(__name__)
+
+
+class ImageToTextConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = ["eng"],
+    ):
+        """
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could be possible candidates for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified here
+                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it is likely an encoding error resulting
+                                in garbled text. Run the following line of code to check available language packs:
+                                # List of available languages
+                                print(pytesseract.get_languages(config=''))
+        """
+
+        # save init parameters to enable export of component config as YAML
+        self.set_config(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
+
+        verify_installation = subprocess.run(["tesseract -v"], shell=True)
+        if verify_installation.returncode == 127:
+            raise Exception(
+                """tesseract is not installed.
+
+                Installation on Linux:
+                apt-get install tesseract-ocr libtesseract-dev poppler-utils
+
+                Installation on macOS:
+                brew install tesseract
+
+                For installing specific language packs check here: https://tesseract-ocr.github.io/tessdoc/Installation.html
+                """
+            )
+        tesseract_langs = []
+        if valid_languages:
+            for language in valid_languages:
+                if (
+                    language in pytesseract.get_languages(config="")
+                    and language not in tesseract_langs
+                ):
+                    tesseract_langs.append(language)
+                else:
+                    raise Exception(
+                        f"""{language} is not a valid tesseract language code, or its language pack is not installed.
+
+                        Check the list of valid tesseract language codes here: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
+
+                        For installing specific language packs check here: https://tesseract-ocr.github.io/tessdoc/Installation.html
+                        """
+                    )
+
+        ## if you have more than one language in images, pass them to tesseract joined with `+`, e.g. `fra+eng`
+        self.tesseract_langs = "+".join(tesseract_langs)
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
+
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "utf-8",
+    ) -> Dict[str, Any]:
+        """
+        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
+
+        :param file_path: path to image file
+        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
+                     Can be any custom keys and values.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could be possible candidates for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages supported by tesseract
+                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it is likely an encoding error resulting
+                                in garbled text.
+        """
+        image = Image.open(file_path)
+        pages = self._image_to_text(image)
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages
+
+        cleaned_pages = []
+        for page in pages:
+            lines = page.splitlines()
+            cleaned_lines = []
+            for line in lines:
+                words = line.split()
+                digits = [word for word in words if any(i.isdigit() for i in word)]
+
+                # remove lines having > 40% of words as digits AND not ending with a period(.)
+                if remove_numeric_tables:
+                    if (
+                        words
+                        and len(digits) / len(words) > 0.4
+                        and not line.strip().endswith(".")
+                    ):
+                        logger.debug(f"Removing line '{line}' from file")
+                        continue
+                cleaned_lines.append(line)
+
+            page = "\n".join(cleaned_lines)
+            cleaned_pages.append(page)
+
+        if valid_languages:
+            document_text = "".join(cleaned_pages)
+            if not self.validate_language(document_text):
+                logger.warning(
+                    f"The language for image is not one of {self.valid_languages}. The file may not have "
+                    f"been decoded in the correct text format."
+                )
+
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": meta}
+        return document
+
+    def _image_to_text(self, image: PpmImageFile) -> List[str]:
+        """
+        Extract text from image file.
+
+        :param image: input image file
+        """
+        text = [pytesseract.image_to_string(image, lang=self.tesseract_langs)]
+        return text
diff --git a/haystack/file_converter/pdf.py b/haystack/file_converter/pdf.py
index 17ea44ed3..28eeaaf31 100644
--- a/haystack/file_converter/pdf.py
+++ b/haystack/file_converter/pdf.py
@@ -1,15 +1,25 @@
 import logging
 import subprocess
 from pathlib import Path
+import tempfile
+import os
 from typing import List, Optional, Dict, Any
+
+from pdf2image import convert_from_path, convert_from_bytes
+
 from haystack.file_converter.base import BaseConverter
+from haystack.file_converter.image import ImageToTextConverter
 
 logger = logging.getLogger(__name__)
 
 
 class PDFToTextConverter(BaseConverter):
-    def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+    ):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                       The tabular structures in documents might be noise for the reader model if it
@@ -22,9 +32,10 @@ class PDFToTextConverter(BaseConverter):
                                 not one of the valid languages, then it might likely be encoding error resulting
                                 in garbled text.
         """
-        # save init parameters to enable export of component config as YAML
-        self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
+        self.set_config(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
 
         verify_installation = subprocess.run(["pdftotext -v"], shell=True)
         if verify_installation.returncode == 127:
@@ -42,7 +53,9 @@ class PDFToTextConverter(BaseConverter):
                 """
             )
 
-        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
 
     def convert(
         self,
@@ -102,7 +115,11 @@ class PDFToTextConverter(BaseConverter):
 
                 # remove lines having > 40% of words as digits AND not ending with a period(.)
                 if remove_numeric_tables:
-                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
+                    if (
+                        words
+                        and len(digits) / len(words) > 0.4
+                        and not line.strip().endswith(".")
+                    ):
                         logger.debug(f"Removing line '{line}' from {file_path}")
                         continue
                 cleaned_lines.append(line)
@@ -122,7 +139,9 @@ class PDFToTextConverter(BaseConverter):
         document = {"text": text, "meta": meta}
         return document
 
-    def _read_pdf(self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1") -> List[str]:
+    def _read_pdf(
+        self, file_path: Path, layout: bool, encoding: Optional[str] = "Latin1"
+    ) -> List[str]:
         """
         Extract pages from the pdf file at file_path.
@@ -134,9 +153,85 @@
             command = ["pdftotext", "-enc", encoding, "-layout", str(file_path), "-"]
         else:
             command = ["pdftotext", "-enc", encoding, str(file_path), "-"]
-        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False) # type: ignore
+        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False)  # type: ignore
         document = output.stdout.decode(errors="ignore")
         pages = document.split("\f")
         pages = pages[:-1]  # the last page in the split is always empty.
         return pages
+
+class PDFToTextOCRConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = ["eng"],
+    ):
+        """
+        Extract text from a PDF file using OCR, via the pytesseract library (https://github.com/madmaze/pytesseract)
+
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could be possible candidates for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages supported by tesseract
+                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it is likely an encoding error resulting
+                                in garbled text.
+        """
+        # init image to text instance
+        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)
+
+        # save init parameters to enable export of component config as YAML
+        self.set_config(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
+
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "utf-8",
+    ) -> Dict[str, Any]:
+        """
+        Convert a file to a dictionary containing the text and any associated meta data.
+
+        File converters may extract file meta like name or size. In addition, user-supplied
+        metadata like author, url, and external IDs can be supplied as a dictionary.
+
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could be possible candidates for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it is likely an encoding error resulting
+                                in garbled text.
+        :param encoding: Select the file encoding (default is `utf-8`)
+        """
+        pages = []
+        try:
+            images = convert_from_path(file_path)
+            for image in images:
+                temp_img = tempfile.NamedTemporaryFile(
+                    dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg"
+                )
+                image.save(temp_img.name)
+                pages.append(self.image_2_text.convert(temp_img.name)["text"])
+        except Exception as exception:
+            logger.error(f"File {file_path} has an error \n {exception}")
+
+        raw_text = "\f".join(pages)
+        document = {"text": raw_text, "meta": meta}
+
+        return document
diff --git a/haystack/file_converter/tika.py b/haystack/file_converter/tika.py
index 42ec2e459..1b11f9910 100644
--- a/haystack/file_converter/tika.py
+++ b/haystack/file_converter/tika.py
@@ -21,7 +21,9 @@ class TikaXHTMLParser(HTMLParser):
 
     def handle_starttag(self, tag, attrs):
         # find page div
-        pagediv = [value for attr, value in attrs if attr == "class" and value == "page"]
+        pagediv = [
+            value for attr, value in attrs if attr == "class" and value == "page"
+        ]
         if tag == "div" and pagediv:
             self.ingest = True
@@ -43,7 +45,7 @@ class TikaConverter(BaseConverter):
         self,
         tika_url: str = "http://localhost:9998/tika",
         remove_numeric_tables: bool = False,
-        valid_languages: Optional[List[str]] = None
+        valid_languages: Optional[List[str]] = None,
     ):
         """
         :param tika_url: URL of the Tika server
@@ -61,15 +63,21 @@ class TikaConverter(BaseConverter):
 
         # save init parameters to enable export of component config as YAML
         self.set_config(
-            tika_url=tika_url, remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+            tika_url=tika_url,
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
         )
 
         ping = requests.get(tika_url)
         if ping.status_code != 200:
-            raise Exception(f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
-                            f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'")
+            raise Exception(
+                f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
+                f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'"
+            )
         self.tika_url = tika_url
-        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages
+        )
 
     def convert(
         self,
@@ -101,7 +109,9 @@ class TikaConverter(BaseConverter):
         if valid_languages is None:
             valid_languages = self.valid_languages
 
-        parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
+        parsed = tikaparser.from_file(
+            file_path.as_posix(), self.tika_url, xmlContent=True
+        )
         parser = TikaXHTMLParser()
         parser.feed(parsed["content"])
@@ -116,7 +126,11 @@ class TikaConverter(BaseConverter):
                 # remove lines having > 40% of words as digits AND not ending with a period(.)
                 if remove_numeric_tables:
-                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
+                    if (
+                        words
+                        and len(digits) / len(words) > 0.4
+                        and not line.strip().endswith(".")
+                    ):
                         logger.debug(f"Removing line '{line}' from {file_path}")
                         continue
diff --git a/requirements.txt b/requirements.txt
index 9ef28121c..996ab8ab5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,10 @@ elastic-apm
 tox
 coverage
 langdetect # for PDF conversions
+# for PDF conversions using OCR
+pytesseract==0.3.7
+pillow==8.2.0
+pdf2image==1.14.0
 # optional: sentence-transformers
 python-multipart
 python-docx
diff --git a/test/test_file_converter.py b/test/test_file_converter.py
index ab2d54502..8749bf494 100644
--- a/test/test_file_converter.py
+++ b/test/test_file_converter.py
@@ -4,12 +4,14 @@ import pytest
 
 from haystack.file_converter import MarkdownConverter
 from haystack.file_converter.docx import DocxToTextConverter
-from haystack.file_converter.pdf import PDFToTextConverter
+from haystack.file_converter.pdf import PDFToTextConverter, PDFToTextOCRConverter
 from haystack.file_converter.tika import TikaConverter
 
 
 @pytest.mark.tika
-@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
+@pytest.mark.parametrize(
+    "Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
+)
 def test_convert(Converter, xpdf_fixture):
     converter = Converter()
     document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
@@ -17,6 +19,13 @@
     assert len(pages) == 4  # the sample PDF file has four pages.
     assert pages[0] != ""  # the page 1 of PDF contains text.
     assert pages[2] == ""  # the page 3 of PDF file is empty.
+    # assert text is retained from the document.
+    # As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
+    page_standard_whitespace = " ".join(pages[0].split())
+    assert (
+        "Adobe Systems made the PDF specification available free of charge in 1993."
+        in page_standard_whitespace
+    )
 
 
@@ -29,22 +38,23 @@ def test_table_removal(Converter, xpdf_fixture):
     assert "324" not in pages[0]
     assert "54x growth" not in pages[0]
-    # assert text is retained from the document.
-    # As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
-    page_standard_whitespace = " ".join(pages[0].split())
-    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
-
 
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_language_validation(Converter, xpdf_fixture, caplog):
     converter = Converter(valid_languages=["en"])
     converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
-    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
+    assert (
+        "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']."
+        not in caplog.text
+    )
 
     converter = Converter(valid_languages=["de"])
     converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
-    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
+    assert (
+        "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']."
+        in caplog.text
+    )
 
 
 def test_docx_converter():
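For reviewers who want to try the new converters locally, here is a minimal usage sketch (not part of the diff). It assumes tesseract-ocr with the English language pack and poppler-utils are installed, as in the Dockerfile/CI changes above; `scanned_page.png` is a placeholder path, while the sample PDF path matches the one used in the tests when run from the `test` directory.

```python
from pathlib import Path

from haystack.file_converter import ImageToTextConverter, PDFToTextOCRConverter

# OCR a single image file (placeholder path)
image_converter = ImageToTextConverter(valid_languages=["eng"])
image_doc = image_converter.convert(file_path=Path("scanned_page.png"))
print(image_doc["text"][:200])

# OCR a whole PDF: each page is rendered to an image via pdf2image and passed to tesseract;
# pages are joined with form feeds ("\f"), matching PDFToTextConverter's output format
pdf_converter = PDFToTextOCRConverter(valid_languages=["eng"])
pdf_doc = pdf_converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
pages = pdf_doc["text"].split("\f")
print(f"Extracted {len(pages)} pages")
```

Both converters return the same `{"text": ..., "meta": ...}` dictionary as the existing converters, so they can be dropped into existing indexing pipelines without further changes.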