Refactor file converter interface (#393)

2025-10-21 04:48:59 +00:00 · 2020-09-18 10:42:13 +02:00 · 2020-09-18 10:42:13 +02:00 · 3399fc784d
commit 3399fc784d
parent 4e46d9d176
9 changed files with 60 additions and 44 deletions
--- a/haystack/file_converter/base.py
+++ b/haystack/file_converter/base.py
@ -45,7 +45,16 @@ class BaseConverter:
        self.valid_languages = valid_languages
    @abstractmethod
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
        """
        Convert a file to a dictionary containing the text and any associated meta data.
        File converters may extract file meta like name or size. In addition to it, user
        supplied meta data like author, url, external IDs can be supplied as a dictionary.
        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        """
        pass
    def validate_language(self, text: str) -> bool:
--- a/haystack/file_converter/docx.py
+++ b/haystack/file_converter/docx.py
@ -1,14 +1,16 @@
 from haystack.file_converter.base import BaseConverter
 import logging
 from pathlib import Path
-from typing import List, Dict, Optional, Any, Tuple
+from typing import Dict, Optional, Any
 import docx
 from haystack.file_converter.base import BaseConverter
 logger = logging.getLogger(__name__)
 class DocxToTextConverter(BaseConverter):
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        """
        Extract text from a .docx file.
        Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter):
        :param file_path: Path to the .docx file you want to convert
        """
-        #TODO We might want to join small passages here (e.g. titles)
+        file = docx.Document(file_path)  # Creating word reader object.
-        #TODO Investigate if there's a workaround to extract on a page level rather than passage level
+        paragraphs = [para.text for para in file.paragraphs]
-        #  (e.g. in the test sample it seemed that page breaks resulted in a paragraphs with only a "\n"
+        text = "".join(paragraphs)
-
+        document = {"text": text, "meta": meta}
-        doc = docx.Document(file_path)  # Creating word reader object.
+        return document
        fullText = []
        for para in doc.paragraphs:
            if para.text.strip() != "":
                fullText.append(para.text)
        return fullText, None
--- a/haystack/file_converter/pdf.py
+++ b/haystack/file_converter/pdf.py
@ -2,7 +2,7 @@ import logging
 import re
 import subprocess
 from pathlib import Path
-from typing import List, Optional, Dict, Tuple, Any
+from typing import List, Optional, Dict, Any
 from haystack.file_converter.base import BaseConverter
@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter):
            valid_languages=valid_languages,
        )
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        pages = self._read_pdf(file_path, layout=False)
@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter):
            )
            logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
-        return cleaned_pages, None
+        text = "\f".join(cleaned_pages)
        document = {"text": text, "meta": meta}
        return document
    def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
        """
--- a/haystack/file_converter/tika.py
+++ b/haystack/file_converter/tika.py
@ -81,7 +81,7 @@ class TikaConverter(BaseConverter):
            valid_languages=valid_languages,
        )
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        """
        :param file_path: Path of file to be converted.
@ -132,4 +132,6 @@ class TikaConverter(BaseConverter):
            )
            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
-        return cleaned_pages, parsed["metadata"]
+        text = "\f".join(cleaned_pages)
        document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
        return document
--- a/haystack/file_converter/txt.py
+++ b/haystack/file_converter/txt.py
@ -1,7 +1,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import List, Optional, Tuple, Any, Dict
+from typing import List, Optional, Any, Dict
 from haystack.file_converter.base import BaseConverter
@ -44,7 +44,7 @@ class TextConverter(BaseConverter):
            valid_languages=valid_languages,
        )
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        with open(file_path) as f:
            text = f.read()
            pages = text.split("\f")
@ -89,5 +89,7 @@ class TextConverter(BaseConverter):
            )
            logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
-        return cleaned_pages, None
+        text = "".join(pages)
        document = {"text": text, "meta": meta}
        return document
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
            with open(path) as doc:
                text = doc.read()
        elif path.suffix.lower() == ".pdf" and pdf_converter:
-            pages, _ = pdf_converter.extract_pages(path)
+            document = pdf_converter.convert(path)
-            text = "\n".join(pages)
+            text = document["text"]
        else:
            raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
@ -138,10 +138,11 @@ def tika_convert_files_to_dicts(
    documents = []
    for path in file_paths:
-        pages, meta = converter.extract_pages(path)
+        document = converter.convert(path)
-        meta = meta or {}
+        meta = document["meta"] or {}
        meta["name"] = path.name
-        text = ' '.join(pages)
+        text = document["text"]
        pages = text.split("\f")
        if split_paragraphs:
            if pages:
--- a/rest_api/controller/file_upload.py
+++ b/rest_api/controller/file_upload.py
@ -63,7 +63,7 @@ def upload_file_to_document_store(
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
-            pages = pdf_converter.extract_pages(file_path)
+            document = pdf_converter.convert(file_path)
        elif file.filename.split(".")[-1].lower() == "txt":
            txt_converter = TextConverter(
                remove_numeric_tables=remove_numeric_tables,
@ -72,12 +72,12 @@ def upload_file_to_document_store(
                remove_header_footer=remove_header_footer,
                valid_languages=valid_languages,
            )
-            pages = txt_converter.extract_pages(file_path)
+            document = txt_converter.convert(file_path)
        else:
            raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
-        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
+        document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
-        document_store.write_documents([document])
+        document_store.write_documents([document_to_write])
        return "File upload was successful."
    finally:
        file.file.close()
--- a/test/test_docx_conversion.py
+++ b/test/test_docx_conversion.py
@ -3,8 +3,7 @@ from pathlib import Path
 from haystack.file_converter.docx import DocxToTextConverter
-def test_extract_pages():
+def test_convert():
    converter = DocxToTextConverter()
-    paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx"))
+    document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
-    assert len(paragraphs) == 8  # Sample has 8 Paragraphs
+    assert document["text"].startswith("Sample Docx File")
    assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'
--- a/test/test_pdf_conversion.py
+++ b/test/test_pdf_conversion.py
@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_extract_pages(Converter, xpdf_fixture):
+def test_convert(Converter, xpdf_fixture):
    converter = Converter()
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    pages = document["text"].split("\f")
    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != ""  # the page 1 of PDF contains text.
    assert pages[2] == ""  # the page 3 of PDF file is empty.
@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter, xpdf_fixture):
    converter = Converter(remove_numeric_tables=True)
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
-
+    pages = document["text"].split("\f")
    # assert numeric rows are removed from the table.
    assert "324" not in pages[0]
    assert "54x growth" not in pages[0]
@ -31,11 +32,11 @@ def test_table_removal(Converter, xpdf_fixture):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_language_validation(Converter, xpdf_fixture, caplog):
    converter = Converter(valid_languages=["en"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
    converter = Converter(valid_languages=["de"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture):
    converter = Converter(remove_header_footer=True)
    converter_no_removal = Converter(remove_header_footer=False)
-    pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
-    pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
    pages1 = document1["text"].split("\f")
    pages2 = document2["text"].split("\f")
    for p1, p2 in zip(pages1, pages2):
        assert p2 == p2
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
    pages = document["text"].split("\f")
    assert len(pages) == 4
    for page in pages:
        assert "This is a header." not in page