From 3399fc784d78b5cf0969a5c46e7954e29bc4fcda Mon Sep 17 00:00:00 2001
From: Tanay Soni <tanaysoni12@gmail.com>
Date: Fri, 18 Sep 2020 10:42:13 +0200
Subject: [PATCH] Refactor file converter interface (#393)

---
 haystack/file_converter/base.py    | 11 ++++++++++-
 haystack/file_converter/docx.py    | 23 ++++++++++-------------
 haystack/file_converter/pdf.py     |  8 +++++---
 haystack/file_converter/tika.py    |  6 ++++--
 haystack/file_converter/txt.py     |  8 +++++---
 haystack/preprocessor/utils.py     | 11 ++++++-----
 rest_api/controller/file_upload.py |  8 ++++----
 test/test_docx_conversion.py       |  7 +++----
 test/test_pdf_conversion.py        | 22 +++++++++++++---------
 9 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/haystack/file_converter/base.py b/haystack/file_converter/base.py
index 483af3c19..aac4e7334 100644
--- a/haystack/file_converter/base.py
+++ b/haystack/file_converter/base.py
@@ -45,7 +45,16 @@ class BaseConverter:
         self.valid_languages = valid_languages
 
     @abstractmethod
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
+        """
+        Convert a file to a dictionary containing the text and any associated meta data.
+
+        File converters may extract file meta like name or size. In addition to it, user
+        supplied meta data like author, url, external IDs can be supplied as a dictionary.
+
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        """
         pass
 
     def validate_language(self, text: str) -> bool:
diff --git a/haystack/file_converter/docx.py b/haystack/file_converter/docx.py
index 43d3f8992..4d1842e2f 100644
--- a/haystack/file_converter/docx.py
+++ b/haystack/file_converter/docx.py
@@ -1,14 +1,16 @@
-from haystack.file_converter.base import BaseConverter
 import logging
 from pathlib import Path
-from typing import List, Dict, Optional, Any, Tuple
+from typing import Dict, Optional, Any
+
 import docx
 
+from haystack.file_converter.base import BaseConverter
+
 logger = logging.getLogger(__name__)
 
 
 class DocxToTextConverter(BaseConverter):
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         Extract text from a .docx file.
         Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter):
         :param file_path: Path to the .docx file you want to convert
         """
 
-        #TODO We might want to join small passages here (e.g. titles)
-        #TODO Investigate if there's a workaround to extract on a page level rather than passage level
-        #  (e.g. in the test sample it seemed that page breaks resulted in a paragraphs with only a "\n"
-
-        doc = docx.Document(file_path)  # Creating word reader object.
-        fullText = []
-        for para in doc.paragraphs:
-            if para.text.strip() != "":
-                fullText.append(para.text)
-        return fullText, None
+        file = docx.Document(file_path)  # Creating word reader object.
+        paragraphs = [para.text for para in file.paragraphs]
+        text = "".join(paragraphs)
+        document = {"text": text, "meta": meta}
+        return document
diff --git a/haystack/file_converter/pdf.py b/haystack/file_converter/pdf.py
index 8ee4714f8..8c2d6c2e4 100644
--- a/haystack/file_converter/pdf.py
+++ b/haystack/file_converter/pdf.py
@@ -2,7 +2,7 @@ import logging
 import re
 import subprocess
 from pathlib import Path
-from typing import List, Optional, Dict, Tuple, Any
+from typing import List, Optional, Dict, Any
 
 from haystack.file_converter.base import BaseConverter
 
@@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
 
         pages = self._read_pdf(file_path, layout=False)
 
@@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": meta}
+        return document
 
     def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
         """
diff --git a/haystack/file_converter/tika.py b/haystack/file_converter/tika.py
index 71b427481..7e68ec810 100644
--- a/haystack/file_converter/tika.py
+++ b/haystack/file_converter/tika.py
@@ -81,7 +81,7 @@ class TikaConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         :param file_path: Path of file to be converted.
 
@@ -132,4 +132,6 @@ class TikaConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
 
-        return cleaned_pages, parsed["metadata"]
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
+        return document
diff --git a/haystack/file_converter/txt.py b/haystack/file_converter/txt.py
index 6e5f6dece..695627351 100644
--- a/haystack/file_converter/txt.py
+++ b/haystack/file_converter/txt.py
@@ -1,7 +1,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import List, Optional, Tuple, Any, Dict
+from typing import List, Optional, Any, Dict
 
 from haystack.file_converter.base import BaseConverter
 
@@ -44,7 +44,7 @@ class TextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         with open(file_path) as f:
             text = f.read()
             pages = text.split("\f")
@@ -89,5 +89,7 @@ class TextConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "".join(pages)
+        document = {"text": text, "meta": meta}
+        return document
 
diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py
index 323b01cf6..912ee85f3 100644
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
             with open(path) as doc:
                 text = doc.read()
         elif path.suffix.lower() == ".pdf" and pdf_converter:
-            pages, _ = pdf_converter.extract_pages(path)
-            text = "\n".join(pages)
+            document = pdf_converter.convert(path)
+            text = document["text"]
         else:
             raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
 
@@ -138,10 +138,11 @@ def tika_convert_files_to_dicts(
 
     documents = []
     for path in file_paths:
-        pages, meta = converter.extract_pages(path)
-        meta = meta or {}
+        document = converter.convert(path)
+        meta = document["meta"] or {}
         meta["name"] = path.name
-        text = ' '.join(pages)
+        text = document["text"]
+        pages = text.split("\f")
 
         if split_paragraphs:
             if pages:
diff --git a/rest_api/controller/file_upload.py b/rest_api/controller/file_upload.py
index c79bf02a7..a5773832b 100644
--- a/rest_api/controller/file_upload.py
+++ b/rest_api/controller/file_upload.py
@@ -63,7 +63,7 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = pdf_converter.extract_pages(file_path)
+            document = pdf_converter.convert(file_path)
         elif file.filename.split(".")[-1].lower() == "txt":
             txt_converter = TextConverter(
                 remove_numeric_tables=remove_numeric_tables,
@@ -72,12 +72,12 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = txt_converter.extract_pages(file_path)
+            document = txt_converter.convert(file_path)
         else:
             raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
 
-        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
-        document_store.write_documents([document])
+        document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
+        document_store.write_documents([document_to_write])
         return "File upload was successful."
     finally:
         file.file.close()
diff --git a/test/test_docx_conversion.py b/test/test_docx_conversion.py
index 88c834c72..92ab482f8 100644
--- a/test/test_docx_conversion.py
+++ b/test/test_docx_conversion.py
@@ -3,8 +3,7 @@ from pathlib import Path
 from haystack.file_converter.docx import DocxToTextConverter
 
 
-def test_extract_pages():
+def test_convert():
     converter = DocxToTextConverter()
-    paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx"))
-    assert len(paragraphs) == 8  # Sample has 8 Paragraphs
-    assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'
+    document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
+    assert document["text"].startswith("Sample Docx File")
diff --git a/test/test_pdf_conversion.py b/test/test_pdf_conversion.py
index b785bb605..93e06bcc5 100644
--- a/test/test_pdf_conversion.py
+++ b/test/test_pdf_conversion.py
@@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter
 
 
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_extract_pages(Converter, xpdf_fixture):
+def test_convert(Converter, xpdf_fixture):
     converter = Converter()
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     assert len(pages) == 4  # the sample PDF file has four pages.
     assert pages[0] != ""  # the page 1 of PDF contains text.
     assert pages[2] == ""  # the page 3 of PDF file is empty.
@@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture):
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter, xpdf_fixture):
     converter = Converter(remove_numeric_tables=True)
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
-
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     # assert numeric rows are removed from the table.
     assert "324" not in pages[0]
     assert "54x growth" not in pages[0]
@@ -31,11 +32,11 @@ def test_table_removal(Converter, xpdf_fixture):
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_language_validation(Converter, xpdf_fixture, caplog):
     converter = Converter(valid_languages=["en"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
 
     converter = Converter(valid_languages=["de"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
 
 
@@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture):
     converter = Converter(remove_header_footer=True)
     converter_no_removal = Converter(remove_header_footer=False)
 
-    pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
-    pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    pages1 = document1["text"].split("\f")
+    pages2 = document2["text"].split("\f")
     for p1, p2 in zip(pages1, pages2):
         assert p2 == p2
 
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    pages = document["text"].split("\f")
     assert len(pages) == 4
     for page in pages:
         assert "This is a header." not in page