Refactor file converter interface (#393)

parent 4e46d9d176
commit 3399fc784d
@@ -45,7 +45,16 @@ class BaseConverter:
         self.valid_languages = valid_languages
 
     @abstractmethod
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
+        """
+        Convert a file to a dictionary containing the text and any associated meta data.
+
+        File converters may extract file meta like name or size. In addition to it, user
+        supplied meta data like author, url, external IDs can be supplied as a dictionary.
+
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        """
         pass
 
     def validate_language(self, text: str) -> bool:
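The new contract: every converter returns a single document dict instead of a (pages, metadata) tuple. A minimal usage sketch of the refactored interface (the meta value is hypothetical; the sample path is the one used by the tests below):

    from pathlib import Path

    from haystack.file_converter.docx import DocxToTextConverter

    converter = DocxToTextConverter()
    document = converter.convert(
        file_path=Path("samples/docx/sample_docx.docx"),  # sample file from the test suite
        meta={"author": "Jane Doe"},  # hypothetical user-supplied meta
    )
    assert set(document.keys()) == {"text", "meta"}  # the document dict exposes "text" and "meta"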
@@ -1,14 +1,16 @@
-from haystack.file_converter.base import BaseConverter
 import logging
 from pathlib import Path
-from typing import List, Dict, Optional, Any, Tuple
+from typing import Dict, Optional, Any
 
 import docx
 
+from haystack.file_converter.base import BaseConverter
+
 logger = logging.getLogger(__name__)
 
 
 class DocxToTextConverter(BaseConverter):
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         Extract text from a .docx file.
         Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter):
         :param file_path: Path to the .docx file you want to convert
         """
-
-        #TODO We might want to join small passages here (e.g. titles)
-        #TODO Investigate if there's a workaround to extract on a page level rather than passage level
-        # (e.g. in the test sample it seemed that page breaks resulted in a paragraphs with only a "\n"
-
-        doc = docx.Document(file_path)  # Creating word reader object.
-        fullText = []
-        for para in doc.paragraphs:
-            if para.text.strip() != "":
-                fullText.append(para.text)
-        return fullText, None
+        file = docx.Document(file_path)  # Creating word reader object.
+        paragraphs = [para.text for para in file.paragraphs]
+        text = "".join(paragraphs)
+        document = {"text": text, "meta": meta}
+        return document
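Note that the refactored docx converter joins paragraphs with an empty string, so paragraph boundaries are not preserved in document["text"]. A standalone illustration (hypothetical paragraph values, not part of the commit):

    paragraphs = ["Sample Docx File", "Some text"]  # hypothetical extracted paragraphs
    text = "".join(paragraphs)
    assert text == "Sample Docx FileSome text"  # boundaries are lost in the joined text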
@@ -2,7 +2,7 @@ import logging
 import re
 import subprocess
 from pathlib import Path
-from typing import List, Optional, Dict, Tuple, Any
+from typing import List, Optional, Dict, Any
 
 from haystack.file_converter.base import BaseConverter
@@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
 
         pages = self._read_pdf(file_path, layout=False)
@@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": meta}
+        return document
 
     def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
         """
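Because the PDF pages are now joined with "\f" (form feed), callers can still recover per-page text from the single string. A quick round-trip illustration (standalone Python, hypothetical page values):

    pages = ["page one", "page two", "", "page four"]  # hypothetical extracted pages
    text = "\f".join(pages)  # what convert() stores under document["text"]
    assert text.split("\f") == pages  # empty pages survive the round trip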
@@ -81,7 +81,7 @@ class TikaConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         :param file_path: Path of file to be converted.
@@ -132,4 +132,6 @@ class TikaConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
 
-        return cleaned_pages, parsed["metadata"]
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
+        return document
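In the Tika converter the returned meta merges Tika's parsed metadata with the user-supplied dict; because later entries win in a dict-unpacking merge, user values take precedence. A standalone illustration (hypothetical values, not part of the commit):

    parsed_metadata = {"Content-Type": "application/pdf", "author": "tika-guess"}
    meta = {"author": "Jane Doe"}  # hypothetical user-supplied meta
    merged = {**parsed_metadata, **(meta or {})}
    assert merged["author"] == "Jane Doe"  # user-supplied value overrides the parsed one
    assert merged["Content-Type"] == "application/pdf"  # parsed-only keys are kept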
@@ -1,7 +1,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import List, Optional, Tuple, Any, Dict
+from typing import List, Optional, Any, Dict
 
 from haystack.file_converter.base import BaseConverter
@@ -44,7 +44,7 @@ class TextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         with open(file_path) as f:
             text = f.read()
         pages = text.split("\f")
@@ -89,5 +89,7 @@ class TextConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "".join(pages)
+        document = {"text": text, "meta": meta}
+        return document
@@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
             with open(path) as doc:
                 text = doc.read()
         elif path.suffix.lower() == ".pdf" and pdf_converter:
-            pages, _ = pdf_converter.extract_pages(path)
-            text = "\n".join(pages)
+            document = pdf_converter.convert(path)
+            text = document["text"]
         else:
             raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
@@ -138,10 +138,11 @@ def tika_convert_files_to_dicts(
 
     documents = []
     for path in file_paths:
-        pages, meta = converter.extract_pages(path)
-        meta = meta or {}
+        document = converter.convert(path)
+        meta = document["meta"] or {}
         meta["name"] = path.name
-        text = ' '.join(pages)
+        text = document["text"]
+        pages = text.split("\f")
 
         if split_paragraphs:
             if pages:
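The updated loop reads both text and meta back out of the returned dict, and splitting on "\f" restores the page list that extract_pages() used to return directly. The loop body in isolation (hypothetical document value, not part of the commit):

    document = {"text": "page one\fpage two", "meta": None}  # hypothetical convert() output
    meta = document["meta"] or {}  # tolerate converters that return meta=None
    meta["name"] = "sample.pdf"  # hypothetical file name (path.name in the real loop)
    pages = document["text"].split("\f")
    assert pages == ["page one", "page two"]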
@@ -63,7 +63,7 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = pdf_converter.extract_pages(file_path)
+            document = pdf_converter.convert(file_path)
         elif file.filename.split(".")[-1].lower() == "txt":
             txt_converter = TextConverter(
                 remove_numeric_tables=remove_numeric_tables,
@@ -72,12 +72,12 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = txt_converter.extract_pages(file_path)
+            document = txt_converter.convert(file_path)
         else:
             raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
 
-        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
-        document_store.write_documents([document])
+        document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
+        document_store.write_documents([document_to_write])
         return "File upload was successful."
     finally:
         file.file.close()
@@ -3,8 +3,7 @@ from pathlib import Path
 from haystack.file_converter.docx import DocxToTextConverter
 
 
-def test_extract_pages():
+def test_convert():
     converter = DocxToTextConverter()
-    paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx"))
-    assert len(paragraphs) == 8  # Sample has 8 Paragraphs
-    assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'
+    document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
+    assert document["text"].startswith("Sample Docx File")
@@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter
 
 
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_extract_pages(Converter, xpdf_fixture):
+def test_convert(Converter, xpdf_fixture):
     converter = Converter()
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     assert len(pages) == 4  # the sample PDF file has four pages.
     assert pages[0] != ""  # the page 1 of PDF contains text.
     assert pages[2] == ""  # the page 3 of PDF file is empty.
@@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture):
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter, xpdf_fixture):
     converter = Converter(remove_numeric_tables=True)
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
-
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     # assert numeric rows are removed from the table.
     assert "324" not in pages[0]
     assert "54x growth" not in pages[0]
@@ -31,11 +32,11 @@
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_language_validation(Converter, xpdf_fixture, caplog):
     converter = Converter(valid_languages=["en"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
 
     converter = Converter(valid_languages=["de"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
@@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture):
     converter = Converter(remove_header_footer=True)
     converter_no_removal = Converter(remove_header_footer=False)
 
-    pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
-    pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    pages1 = document1["text"].split("\f")
+    pages2 = document2["text"].split("\f")
     for p1, p2 in zip(pages1, pages2):
         assert p1 == p2
 
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    pages = document["text"].split("\f")
     assert len(pages) == 4
     for page in pages:
         assert "This is a header." not in page