From 1e4ef24ae90ea36e56a5a68200fa1500d5124cc8 Mon Sep 17 00:00:00 2001
From: Daniel Bichuetti <daniel.bichuetti@gmail.com>
Date: Wed, 22 Feb 2023 04:50:18 -0300
Subject: [PATCH] refactor: isolate PDF converters (#4193)

---
 haystack/nodes/file_converter/__init__.py |   6 +-
 haystack/nodes/file_converter/pdf.py      | 104 +--------------------
 haystack/nodes/file_converter/pdf_ocr.py  | 108 ++++++++++++++++++++++
 3 files changed, 113 insertions(+), 105 deletions(-)
 create mode 100644 haystack/nodes/file_converter/pdf_ocr.py

diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py
index 53a83e8bd..1a60448a9 100644
--- a/haystack/nodes/file_converter/__init__.py
+++ b/haystack/nodes/file_converter/__init__.py
@@ -9,6 +9,7 @@ from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
 from haystack.nodes.file_converter.txt import TextConverter
 from haystack.nodes.file_converter.azure import AzureConverter
 from haystack.nodes.file_converter.parsr import ParsrConverter
+from haystack.nodes.file_converter.pdf import PDFToTextConverter
 
 MarkdownConverter = safe_import(
     "haystack.nodes.file_converter.markdown", "MarkdownConverter", "preprocessing"
@@ -16,9 +17,6 @@ MarkdownConverter = safe_import(
 ImageToTextConverter = safe_import(
     "haystack.nodes.file_converter.image", "ImageToTextConverter", "ocr"
 )  # Has optional dependencies
-PDFToTextConverter = safe_import(
-    "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "ocr"
-)  # Has optional dependencies
 PDFToTextOCRConverter = safe_import(
-    "haystack.nodes.file_converter.pdf", "PDFToTextOCRConverter", "ocr"
+    "haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr"
 )  # Has optional dependencies
diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py
index 6594e4cfb..05c9950fe 100644
--- a/haystack/nodes/file_converter/pdf.py
+++ b/haystack/nodes/file_converter/pdf.py
@@ -1,18 +1,9 @@
 import logging
 import subprocess
-import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
-try:
-    from pdf2image import convert_from_path
-except (ImportError, ModuleNotFoundError) as ie:
-    from haystack.utils.import_utils import _optional_component_not_installed
-
-    _optional_component_not_installed(__name__, "ocr", ie)
-
 from haystack.nodes.file_converter.base import BaseConverter
-from haystack.nodes.file_converter.image import ImageToTextConverter
 from haystack.schema import Document
 
 logger = logging.getLogger(__name__)
@@ -52,7 +43,9 @@ class PDFToTextConverter(BaseConverter):
             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
         )
         try:
-            subprocess.run(["pdftotext", "-v"], shell=False, check=False)
+            subprocess.run(
+                ["pdftotext", "-v"], shell=False, check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+            )
         except FileNotFoundError:
             raise FileNotFoundError(
                 """pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
@@ -202,94 +195,3 @@ class PDFToTextConverter(BaseConverter):
         pages = pages[:-1]  # the last page in the split is always empty.
 
         return pages
-
-
-class PDFToTextOCRConverter(BaseConverter):
-    def __init__(
-        self,
-        remove_numeric_tables: bool = False,
-        valid_languages: Optional[List[str]] = None,
-        id_hash_keys: Optional[List[str]] = None,
-    ):
-        """
-        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
-
-        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
-                                      The tabular structures in documents might be noise for the reader model if it
-                                      does not have table parsing capability for finding answers. However, tables
-                                      may also have long strings that could possible candidate for searching answers.
-                                      The rows containing strings are thus retained in this option.
-        :param valid_languages: validate languages from a list of languages supported by tessarect
-                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
-                                This option can be used to add test for encoding errors. If the extracted text is
-                                not one of the valid languages, then it might likely be encoding error resulting
-                                in garbled text. If no value is provided, English will be set as default.
-        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
-            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
-            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
-            In this case the id will be generated by using the content and the defined metadata.
-        """
-        if valid_languages is None:
-            valid_languages = ["eng"]
-        # init image to text instance
-        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)
-
-        super().__init__(
-            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
-        )
-
-    def convert(
-        self,
-        file_path: Path,
-        meta: Optional[Dict[str, Any]] = None,
-        remove_numeric_tables: Optional[bool] = None,
-        valid_languages: Optional[List[str]] = None,
-        encoding: Optional[str] = None,
-        id_hash_keys: Optional[List[str]] = None,
-        start_page: Optional[int] = None,
-        end_page: Optional[int] = None,
-    ) -> List[Document]:
-        """
-        Convert a file to a dictionary containing the text and any associated meta data.
-
-        File converters may extract file meta like name or size. In addition to it, user
-        supplied meta data like author, url, external IDs can be supplied as a dictionary.
-
-        :param file_path: path of the file to convert
-        :param meta: dictionary of meta data key-value pairs to append in the returned document.
-        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
-                                      The tabular structures in documents might be noise for the reader model if it
-                                      does not have table parsing capability for finding answers. However, tables
-                                      may also have long strings that could possible candidate for searching answers.
-                                      The rows containing strings are thus retained in this option.
-        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
-                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
-                                This option can be used to add test for encoding errors. If the extracted text is
-                                not one of the valid languages, then it might likely be encoding error resulting
-                                in garbled text.
-        :param encoding: Not applicable
-        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
-            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
-            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
-            In this case the id will be generated by using the content and the defined metadata.
-        :param start_page: The page number where to start the conversion
-        :param end_page: The page number where to end the conversion.
-        """
-        if id_hash_keys is None:
-            id_hash_keys = self.id_hash_keys
-
-        start_page = start_page or 1
-
-        pages = []
-        try:
-            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
-            for image in images:
-                temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
-                image.save(temp_img.name)
-                pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
-        except Exception as exception:
-            logger.error("File %s has an error:\n%s", file_path, exception)
-
-        raw_text = "\f" * (start_page - 1) + "\f".join(pages)  # tracking skipped pages for correct page numbering
-        document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
-        return [document]
diff --git a/haystack/nodes/file_converter/pdf_ocr.py b/haystack/nodes/file_converter/pdf_ocr.py
new file mode 100644
index 000000000..a5ca8918e
--- /dev/null
+++ b/haystack/nodes/file_converter/pdf_ocr.py
@@ -0,0 +1,108 @@
+import logging
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+try:
+    from pdf2image import convert_from_path
+except (ImportError, ModuleNotFoundError) as ie:
+    from haystack.utils.import_utils import _optional_component_not_installed
+
+    _optional_component_not_installed(__name__, "ocr", ie)
+
+from haystack.nodes.file_converter.base import BaseConverter
+from haystack.nodes.file_converter.image import ImageToTextConverter
+from haystack.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class PDFToTextOCRConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+    ):
+        """
+        Extract text from a PDF by rendering each page as an image and running OCR on it with ImageToTextConverter.
+
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also contain long strings that could be candidates for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages supported by tesseract
+                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not in one of the valid languages, then it is likely an encoding error resulting
+                                in garbled text. If no value is provided, English will be set as default.
+        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
+            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+            In this case the id will be generated by using the content and the defined metadata.
+        """
+        if valid_languages is None:
+            valid_languages = ["eng"]
+        # OCR converter that convert() applies to every rendered page image
+        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)
+
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
+        )
+
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, Any]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        start_page: Optional[int] = None,
+        end_page: Optional[int] = None,
+    ) -> List[Document]:
+        """
+        Convert a PDF file to a single Document holding the OCR text of the requested pages.
+
+        Conversion errors are logged and not raised: the Document then holds only the pages converted before
+        the failure. User supplied meta data like author, url, external IDs can be supplied as a dictionary.
+
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also contain long strings that could be candidates for searching answers, so
+                                      rows containing strings are retained. NOTE: currently not read by this method.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not in one of the valid languages, then it is likely an encoding error resulting
+                                in garbled text. NOTE: currently not read by this method.
+        :param encoding: Not applicable
+        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
+            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+            In this case the id will be generated by using the content and the defined metadata.
+        :param start_page: The page number where to start the conversion. Defaults to 1.
+        :param end_page: The page number where to end the conversion.
+        """
+        if id_hash_keys is None:
+            id_hash_keys = self.id_hash_keys
+
+        start_page = start_page or 1
+
+        pages = []
+        try:
+            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
+            for image in images:
+                with tempfile.NamedTemporaryFile(suffix=".jpeg") as temp_img:  # closed and deleted after each page
+                    image.save(temp_img.name)
+                    pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
+        except Exception as exception:
+            logger.error("File %s has an error:\n%s", file_path, exception)
+
+        raw_text = "\f" * (start_page - 1) + "\f".join(pages)  # tracking skipped pages for correct page numbering
+        document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
+        return [document]