From 3399fc784d78b5cf0969a5c46e7954e29bc4fcda Mon Sep 17 00:00:00 2001 From: Tanay Soni Date: Fri, 18 Sep 2020 10:42:13 +0200 Subject: [PATCH] Refactor file converter interface (#393) --- haystack/file_converter/base.py | 11 ++++++++++- haystack/file_converter/docx.py | 23 ++++++++++------------- haystack/file_converter/pdf.py | 8 +++++--- haystack/file_converter/tika.py | 6 ++++-- haystack/file_converter/txt.py | 8 +++++--- haystack/preprocessor/utils.py | 11 ++++++----- rest_api/controller/file_upload.py | 8 ++++---- test/test_docx_conversion.py | 7 +++---- test/test_pdf_conversion.py | 22 +++++++++++++--------- 9 files changed, 60 insertions(+), 44 deletions(-) diff --git a/haystack/file_converter/base.py b/haystack/file_converter/base.py index 483af3c19..aac4e7334 100644 --- a/haystack/file_converter/base.py +++ b/haystack/file_converter/base.py @@ -45,7 +45,16 @@ class BaseConverter: self.valid_languages = valid_languages @abstractmethod - def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: + def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]: + """ + Convert a file to a dictionary containing the text and any associated meta data. + + File converters may extract file meta like name or size. In addition to it, user + supplied meta data like author, url, external IDs can be supplied as a dictionary. + + :param file_path: path of the file to convert + :param meta: dictionary of meta data key-value pairs to append in the returned document. + """ pass def validate_language(self, text: str) -> bool: diff --git a/haystack/file_converter/docx.py b/haystack/file_converter/docx.py index 43d3f8992..4d1842e2f 100644 --- a/haystack/file_converter/docx.py +++ b/haystack/file_converter/docx.py @@ -1,14 +1,16 @@ -from haystack.file_converter.base import BaseConverter import logging from pathlib import Path -from typing import List, Dict, Optional, Any, Tuple +from typing import Dict, Optional, Any + import docx +from haystack.file_converter.base import BaseConverter + logger = logging.getLogger(__name__) class DocxToTextConverter(BaseConverter): - def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: + def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]: """ Extract text from a .docx file. Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here. @@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter): :param file_path: Path to the .docx file you want to convert """ - #TODO We might want to join small passages here (e.g. titles) - #TODO Investigate if there's a workaround to extract on a page level rather than passage level - # (e.g. in the test sample it seemed that page breaks resulted in a paragraphs with only a "\n" - - doc = docx.Document(file_path) # Creating word reader object. - fullText = [] - for para in doc.paragraphs: - if para.text.strip() != "": - fullText.append(para.text) - return fullText, None + file = docx.Document(file_path) # Creating word reader object. + paragraphs = [para.text for para in file.paragraphs] + text = "".join(paragraphs) + document = {"text": text, "meta": meta} + return document diff --git a/haystack/file_converter/pdf.py b/haystack/file_converter/pdf.py index 8ee4714f8..8c2d6c2e4 100644 --- a/haystack/file_converter/pdf.py +++ b/haystack/file_converter/pdf.py @@ -2,7 +2,7 @@ import logging import re import subprocess from pathlib import Path -from typing import List, Optional, Dict, Tuple, Any +from typing import List, Optional, Dict, Any from haystack.file_converter.base import BaseConverter @@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter): valid_languages=valid_languages, ) - def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: + def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]: pages = self._read_pdf(file_path, layout=False) @@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter): ) logger.info(f"Removed header '{header}' and footer {footer} in {file_path}") - return cleaned_pages, None + text = "\f".join(cleaned_pages) + document = {"text": text, "meta": meta} + return document def _read_pdf(self, file_path: Path, layout: bool) -> List[str]: """ diff --git a/haystack/file_converter/tika.py b/haystack/file_converter/tika.py index 71b427481..7e68ec810 100644 --- a/haystack/file_converter/tika.py +++ b/haystack/file_converter/tika.py @@ -81,7 +81,7 @@ class TikaConverter(BaseConverter): valid_languages=valid_languages, ) - def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: + def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]: """ :param file_path: Path of file to be converted. @@ -132,4 +132,6 @@ class TikaConverter(BaseConverter): ) logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}") - return cleaned_pages, parsed["metadata"] + text = "\f".join(cleaned_pages) + document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}} + return document diff --git a/haystack/file_converter/txt.py b/haystack/file_converter/txt.py index 6e5f6dece..695627351 100644 --- a/haystack/file_converter/txt.py +++ b/haystack/file_converter/txt.py @@ -1,7 +1,7 @@ import logging import re from pathlib import Path -from typing import List, Optional, Tuple, Any, Dict +from typing import List, Optional, Any, Dict from haystack.file_converter.base import BaseConverter @@ -44,7 +44,7 @@ class TextConverter(BaseConverter): valid_languages=valid_languages, ) - def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: + def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]: with open(file_path) as f: text = f.read() pages = text.split("\f") @@ -89,5 +89,7 @@ class TextConverter(BaseConverter): ) logger.info(f"Removed header '{header}' and footer {footer} in {file_path}") - return cleaned_pages, None + text = "".join(pages) + document = {"text": text, "meta": meta} + return document diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py index 323b01cf6..912ee85f3 100644 --- a/haystack/preprocessor/utils.py +++ b/haystack/preprocessor/utils.py @@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, with open(path) as doc: text = doc.read() elif path.suffix.lower() == ".pdf" and pdf_converter: - pages, _ = pdf_converter.extract_pages(path) - text = "\n".join(pages) + document = pdf_converter.convert(path) + text = document["text"] else: raise Exception(f"Indexing of {path.suffix} files is not currently supported.") @@ -138,10 +138,11 @@ def tika_convert_files_to_dicts( documents = [] for path in file_paths: - pages, meta = converter.extract_pages(path) - meta = meta or {} + document = converter.convert(path) + meta = document["meta"] or {} meta["name"] = path.name - text = ' '.join(pages) + text = document["text"] + pages = text.split("\f") if split_paragraphs: if pages: diff --git a/rest_api/controller/file_upload.py b/rest_api/controller/file_upload.py index c79bf02a7..a5773832b 100644 --- a/rest_api/controller/file_upload.py +++ b/rest_api/controller/file_upload.py @@ -63,7 +63,7 @@ def upload_file_to_document_store( remove_header_footer=remove_header_footer, valid_languages=valid_languages, ) - pages = pdf_converter.extract_pages(file_path) + document = pdf_converter.convert(file_path) elif file.filename.split(".")[-1].lower() == "txt": txt_converter = TextConverter( remove_numeric_tables=remove_numeric_tables, @@ -72,12 +72,12 @@ def upload_file_to_document_store( remove_header_footer=remove_header_footer, valid_languages=valid_languages, ) - pages = txt_converter.extract_pages(file_path) + document = txt_converter.convert(file_path) else: raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.") - document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename} - document_store.write_documents([document]) + document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename} + document_store.write_documents([document_to_write]) return "File upload was successful." finally: file.file.close() diff --git a/test/test_docx_conversion.py b/test/test_docx_conversion.py index 88c834c72..92ab482f8 100644 --- a/test/test_docx_conversion.py +++ b/test/test_docx_conversion.py @@ -3,8 +3,7 @@ from pathlib import Path from haystack.file_converter.docx import DocxToTextConverter -def test_extract_pages(): +def test_convert(): converter = DocxToTextConverter() - paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx")) - assert len(paragraphs) == 8 # Sample has 8 Paragraphs - assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.' + document = converter.convert(file_path=Path("samples/docx/sample_docx.docx")) + assert document["text"].startswith("Sample Docx File") diff --git a/test/test_pdf_conversion.py b/test/test_pdf_conversion.py index b785bb605..93e06bcc5 100644 --- a/test/test_pdf_conversion.py +++ b/test/test_pdf_conversion.py @@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter]) -def test_extract_pages(Converter, xpdf_fixture): +def test_convert(Converter, xpdf_fixture): converter = Converter() - pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) + document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) + pages = document["text"].split("\f") assert len(pages) == 4 # the sample PDF file has four pages. assert pages[0] != "" # the page 1 of PDF contains text. assert pages[2] == "" # the page 3 of PDF file is empty. @@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture): @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter]) def test_table_removal(Converter, xpdf_fixture): converter = Converter(remove_numeric_tables=True) - pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) - + document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) + pages = document["text"].split("\f") # assert numeric rows are removed from the table. assert "324" not in pages[0] assert "54x growth" not in pages[0] @@ -31,11 +32,11 @@ def test_table_removal(Converter, xpdf_fixture): @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter]) def test_language_validation(Converter, xpdf_fixture, caplog): converter = Converter(valid_languages=["en"]) - pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) + converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text converter = Converter(valid_languages=["de"]) - pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) + converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text @@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture): converter = Converter(remove_header_footer=True) converter_no_removal = Converter(remove_header_footer=False) - pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer - pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer + document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer + document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer + pages1 = document1["text"].split("\f") + pages2 = document2["text"].split("\f") for p1, p2 in zip(pages1, pages2): assert p2 == p2 - pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header and footer + document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header and footer + pages = document["text"].split("\f") assert len(pages) == 4 for page in pages: assert "This is a header." not in page