From 1e4ef24ae90ea36e56a5a68200fa1500d5124cc8 Mon Sep 17 00:00:00 2001 From: Daniel Bichuetti Date: Wed, 22 Feb 2023 04:50:18 -0300 Subject: [PATCH] refactor: isolate PDF converters (#4193) --- haystack/nodes/file_converter/__init__.py | 6 +- haystack/nodes/file_converter/pdf.py | 104 +-------------------- haystack/nodes/file_converter/pdf_ocr.py | 108 ++++++++++++++++++++++ 3 files changed, 113 insertions(+), 105 deletions(-) create mode 100644 haystack/nodes/file_converter/pdf_ocr.py diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py index 53a83e8bd..1a60448a9 100644 --- a/haystack/nodes/file_converter/__init__.py +++ b/haystack/nodes/file_converter/__init__.py @@ -9,6 +9,7 @@ from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser from haystack.nodes.file_converter.txt import TextConverter from haystack.nodes.file_converter.azure import AzureConverter from haystack.nodes.file_converter.parsr import ParsrConverter +from haystack.nodes.file_converter.pdf import PDFToTextConverter MarkdownConverter = safe_import( "haystack.nodes.file_converter.markdown", "MarkdownConverter", "preprocessing" @@ -16,9 +17,6 @@ MarkdownConverter = safe_import( ImageToTextConverter = safe_import( "haystack.nodes.file_converter.image", "ImageToTextConverter", "ocr" ) # Has optional dependencies -PDFToTextConverter = safe_import( - "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "ocr" -) # Has optional dependencies PDFToTextOCRConverter = safe_import( - "haystack.nodes.file_converter.pdf", "PDFToTextOCRConverter", "ocr" + "haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr" ) # Has optional dependencies diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index 6594e4cfb..05c9950fe 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -1,18 +1,9 @@ import logging import subprocess -import tempfile from pathlib import Path from typing import Any, Dict, List, Optional -try: - from pdf2image import convert_from_path -except (ImportError, ModuleNotFoundError) as ie: - from haystack.utils.import_utils import _optional_component_not_installed - - _optional_component_not_installed(__name__, "ocr", ie) - from haystack.nodes.file_converter.base import BaseConverter -from haystack.nodes.file_converter.image import ImageToTextConverter from haystack.schema import Document logger = logging.getLogger(__name__) @@ -52,7 +43,9 @@ class PDFToTextConverter(BaseConverter): remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys ) try: - subprocess.run(["pdftotext", "-v"], shell=False, check=False) + subprocess.run( + ["pdftotext", "-v"], shell=False, check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) except FileNotFoundError: raise FileNotFoundError( """pdftotext is not installed. It is part of xpdf or poppler-utils software suite. @@ -202,94 +195,3 @@ class PDFToTextConverter(BaseConverter): pages = pages[:-1] # the last page in the split is always empty. return pages - - -class PDFToTextOCRConverter(BaseConverter): - def __init__( - self, - remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = None, - id_hash_keys: Optional[List[str]] = None, - ): - """ - Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) - - :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. - The tabular structures in documents might be noise for the reader model if it - does not have table parsing capability for finding answers. However, tables - may also have long strings that could possible candidate for searching answers. - The rows containing strings are thus retained in this option. - :param valid_languages: validate languages from a list of languages supported by tessarect - (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). - This option can be used to add test for encoding errors. If the extracted text is - not one of the valid languages, then it might likely be encoding error resulting - in garbled text. If no value is provided, English will be set as default. - :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's - attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are - not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). - In this case the id will be generated by using the content and the defined metadata. - """ - if valid_languages is None: - valid_languages = ["eng"] - # init image to text instance - self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages) - - super().__init__( - remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys - ) - - def convert( - self, - file_path: Path, - meta: Optional[Dict[str, Any]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = None, - id_hash_keys: Optional[List[str]] = None, - start_page: Optional[int] = None, - end_page: Optional[int] = None, - ) -> List[Document]: - """ - Convert a file to a dictionary containing the text and any associated meta data. - - File converters may extract file meta like name or size. In addition to it, user - supplied meta data like author, url, external IDs can be supplied as a dictionary. - - :param file_path: path of the file to convert - :param meta: dictionary of meta data key-value pairs to append in the returned document. - :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. - The tabular structures in documents might be noise for the reader model if it - does not have table parsing capability for finding answers. However, tables - may also have long strings that could possible candidate for searching answers. - The rows containing strings are thus retained in this option. - :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 - (https://en.wikipedia.org/wiki/ISO_639-1) format. - This option can be used to add test for encoding errors. If the extracted text is - not one of the valid languages, then it might likely be encoding error resulting - in garbled text. - :param encoding: Not applicable - :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's - attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are - not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). - In this case the id will be generated by using the content and the defined metadata. - :param start_page: The page number where to start the conversion - :param end_page: The page number where to end the conversion. - """ - if id_hash_keys is None: - id_hash_keys = self.id_hash_keys - - start_page = start_page or 1 - - pages = [] - try: - images = convert_from_path(file_path, first_page=start_page, last_page=end_page) - for image in images: - temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg") - image.save(temp_img.name) - pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content) - except Exception as exception: - logger.error("File %s has an error:\n%s", file_path, exception) - - raw_text = "\f" * (start_page - 1) + "\f".join(pages) # tracking skipped pages for correct page numbering - document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys) - return [document] diff --git a/haystack/nodes/file_converter/pdf_ocr.py b/haystack/nodes/file_converter/pdf_ocr.py new file mode 100644 index 000000000..a5ca8918e --- /dev/null +++ b/haystack/nodes/file_converter/pdf_ocr.py @@ -0,0 +1,108 @@ +import logging +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional + +try: + from pdf2image import convert_from_path +except (ImportError, ModuleNotFoundError) as ie: + from haystack.utils.import_utils import _optional_component_not_installed + + _optional_component_not_installed(__name__, "ocr", ie) + +from haystack.nodes.file_converter.base import BaseConverter +from haystack.nodes.file_converter.image import ImageToTextConverter +from haystack.schema import Document + +logger = logging.getLogger(__name__) + + +class PDFToTextOCRConverter(BaseConverter): + def __init__( + self, + remove_numeric_tables: bool = False, + valid_languages: Optional[List[str]] = None, + id_hash_keys: Optional[List[str]] = None, + ): + """ + Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) + + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could possible candidate for searching answers. + The rows containing strings are thus retained in this option. + :param valid_languages: validate languages from a list of languages supported by tessarect + (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). + This option can be used to add test for encoding errors. If the extracted text is + not one of the valid languages, then it might likely be encoding error resulting + in garbled text. If no value is provided, English will be set as default. + :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's + attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). + In this case the id will be generated by using the content and the defined metadata. + """ + if valid_languages is None: + valid_languages = ["eng"] + # init image to text instance + self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages) + + super().__init__( + remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys + ) + + def convert( + self, + file_path: Path, + meta: Optional[Dict[str, Any]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + encoding: Optional[str] = None, + id_hash_keys: Optional[List[str]] = None, + start_page: Optional[int] = None, + end_page: Optional[int] = None, + ) -> List[Document]: + """ + Convert a file to a dictionary containing the text and any associated meta data. + + File converters may extract file meta like name or size. In addition to it, user + supplied meta data like author, url, external IDs can be supplied as a dictionary. + + :param file_path: path of the file to convert + :param meta: dictionary of meta data key-value pairs to append in the returned document. + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could possible candidate for searching answers. + The rows containing strings are thus retained in this option. + :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 + (https://en.wikipedia.org/wiki/ISO_639-1) format. + This option can be used to add test for encoding errors. If the extracted text is + not one of the valid languages, then it might likely be encoding error resulting + in garbled text. + :param encoding: Not applicable + :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's + attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). + In this case the id will be generated by using the content and the defined metadata. + :param start_page: The page number where to start the conversion + :param end_page: The page number where to end the conversion. + """ + if id_hash_keys is None: + id_hash_keys = self.id_hash_keys + + start_page = start_page or 1 + + pages = [] + try: + images = convert_from_path(file_path, first_page=start_page, last_page=end_page) + for image in images: + temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg") + image.save(temp_img.name) + pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content) + except Exception as exception: + logger.error("File %s has an error:\n%s", file_path, exception) + + raw_text = "\f" * (start_page - 1) + "\f".join(pages) # tracking skipped pages for correct page numbering + document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys) + return [document]