Refactor file converter interface (#393)

Tanay Soni 2020-09-18 10:42:13 +02:00 committed by GitHub
parent 4e46d9d176
commit 3399fc784d
9 changed files with 60 additions and 44 deletions


@@ -45,7 +45,16 @@ class BaseConverter:
self.valid_languages = valid_languages
@abstractmethod
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
"""
Convert a file to a dictionary containing the text and any associated metadata.
File converters may extract file metadata such as name or size. In addition,
user-supplied metadata like author, URL, or external IDs can be passed in as a dictionary.
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document
"""
pass
def validate_language(self, text: str) -> bool:
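For reference, the new convert contract is straightforward to target from a custom converter. The sketch below is illustrative only and not part of this commit (the MarkdownToTextConverter name is made up); it simply shows the {"text": ..., "meta": ...} dictionary every converter now returns.

from pathlib import Path
from typing import Any, Dict, Optional

from haystack.file_converter.base import BaseConverter


class MarkdownToTextConverter(BaseConverter):
    """Illustrative sketch only: a converter implementing the new interface."""

    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        # Read the raw file contents and return the dictionary shape introduced by this refactor.
        # (A real implementation would strip markup; kept minimal here.)
        text = Path(file_path).read_text()
        return {"text": text, "meta": meta}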


@@ -1,14 +1,16 @@
from haystack.file_converter.base import BaseConverter
import logging
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple
from typing import Dict, Optional, Any
import docx
from haystack.file_converter.base import BaseConverter
logger = logging.getLogger(__name__)
class DocxToTextConverter(BaseConverter):
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
"""
Extract text from a .docx file.
Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter):
:param file_path: Path to the .docx file you want to convert
"""
# TODO: We might want to join small passages here (e.g. titles)
# TODO: Investigate whether there is a workaround to extract at page level rather than passage level
#       (e.g. in the test sample, page breaks seemed to result in paragraphs containing only a "\n")
doc = docx.Document(file_path) # Creating word reader object.
fullText = []
for para in doc.paragraphs:
if para.text.strip() != "":
fullText.append(para.text)
return fullText, None
file = docx.Document(file_path) # Creating word reader object.
paragraphs = [para.text for para in file.paragraphs]
text = "".join(paragraphs)
document = {"text": text, "meta": meta}
return document
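A quick usage sketch of the refactored DocxToTextConverter (the sample path is the one used by the updated test below; the meta values are placeholders):

from pathlib import Path

from haystack.file_converter.docx import DocxToTextConverter

converter = DocxToTextConverter()
document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"), meta={"author": "Jane Doe"})
print(document["text"][:50])  # concatenated paragraph text
print(document["meta"])       # {'author': 'Jane Doe'}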


@@ -2,7 +2,7 @@ import logging
import re
import subprocess
from pathlib import Path
from typing import List, Optional, Dict, Tuple, Any
from typing import List, Optional, Dict, Any
from haystack.file_converter.base import BaseConverter
@@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter):
valid_languages=valid_languages,
)
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
pages = self._read_pdf(file_path, layout=False)
@@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter):
)
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
return cleaned_pages, None
text = "\f".join(cleaned_pages)
document = {"text": text, "meta": meta}
return document
def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
"""

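Because pages are now joined with form-feed characters, callers that still need page-level access can split the returned text, as the updated tests and indexing utils below do. A sketch (constructor arguments mirror those appearing elsewhere in this diff; the module path haystack.file_converter.pdf is assumed, and, as in the tests, the pdftotext tool must be available):

from pathlib import Path

from haystack.file_converter.pdf import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
pages = document["text"].split("\f")  # recover the per-page strings from the "\f"-joined text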

@@ -81,7 +81,7 @@ class TikaConverter(BaseConverter):
valid_languages=valid_languages,
)
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
"""
:param file_path: Path of file to be converted.
@@ -132,4 +132,6 @@ class TikaConverter(BaseConverter):
)
logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
return cleaned_pages, parsed["metadata"]
text = "\f".join(cleaned_pages)
document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
return document
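Note the metadata merge in the Tika converter: the user-supplied meta dictionary is unpacked after Tika's parsed file metadata, so on duplicate keys the user-supplied value wins. A usage sketch (assumes a Tika server is reachable; the URL value is a placeholder):

from pathlib import Path

from haystack.file_converter.tika import TikaConverter

converter = TikaConverter()
document = converter.convert(
    file_path=Path("samples/pdf/sample_pdf_1.pdf"),
    meta={"url": "https://example.com/report"},  # merged into document["meta"], overriding Tika keys on collision
)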


@@ -1,7 +1,7 @@
import logging
import re
from pathlib import Path
from typing import List, Optional, Tuple, Any, Dict
from typing import List, Optional, Any, Dict
from haystack.file_converter.base import BaseConverter
@@ -44,7 +44,7 @@ class TextConverter(BaseConverter):
valid_languages=valid_languages,
)
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
with open(file_path) as f:
text = f.read()
pages = text.split("\f")
@@ -89,5 +89,7 @@ class TextConverter(BaseConverter):
)
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
return cleaned_pages, None
text = "".join(pages)
document = {"text": text, "meta": meta}
return document


@@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
with open(path) as doc:
text = doc.read()
elif path.suffix.lower() == ".pdf" and pdf_converter:
pages, _ = pdf_converter.extract_pages(path)
text = "\n".join(pages)
document = pdf_converter.convert(path)
text = document["text"]
else:
raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
@@ -138,10 +138,11 @@ def tika_convert_files_to_dicts(
documents = []
for path in file_paths:
pages, meta = converter.extract_pages(path)
meta = meta or {}
document = converter.convert(path)
meta = document["meta"] or {}
meta["name"] = path.name
text = ' '.join(pages)
text = document["text"]
pages = text.split("\f")
if split_paragraphs:
if pages:

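The caller-side pattern used here generalizes to any converter: call convert, copy the returned metadata, attach the file name, and split the text back into pages if needed. A standalone sketch (the directory path is a placeholder, and TextConverter is assumed to be importable from haystack.file_converter.txt and constructible with default arguments):

from pathlib import Path

from haystack.file_converter.txt import TextConverter

converter = TextConverter()
dicts = []
for path in Path("data/docs").glob("*.txt"):  # placeholder directory
    document = converter.convert(path)
    meta = document["meta"] or {}
    meta["name"] = path.name
    dicts.append({"text": document["text"], "meta": meta})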

@@ -63,7 +63,7 @@ def upload_file_to_document_store(
remove_header_footer=remove_header_footer,
valid_languages=valid_languages,
)
pages = pdf_converter.extract_pages(file_path)
document = pdf_converter.convert(file_path)
elif file.filename.split(".")[-1].lower() == "txt":
txt_converter = TextConverter(
remove_numeric_tables=remove_numeric_tables,
@@ -72,12 +72,12 @@ def upload_file_to_document_store(
remove_header_footer=remove_header_footer,
valid_languages=valid_languages,
)
pages = txt_converter.extract_pages(file_path)
document = txt_converter.convert(file_path)
else:
raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
document_store.write_documents([document])
document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
document_store.write_documents([document_to_write])
return "File upload was successful."
finally:
file.file.close()


@@ -3,8 +3,7 @@ from pathlib import Path
from haystack.file_converter.docx import DocxToTextConverter
def test_extract_pages():
def test_convert():
converter = DocxToTextConverter()
paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx"))
assert len(paragraphs) == 8 # Sample has 8 Paragraphs
assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'
document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
assert document["text"].startswith("Sample Docx File")


@@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_extract_pages(Converter, xpdf_fixture):
def test_convert(Converter, xpdf_fixture):
converter = Converter()
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
pages = document["text"].split("\f")
assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != ""  # page 1 of the PDF contains text
assert pages[2] == ""  # page 3 of the PDF is empty
@@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter, xpdf_fixture):
converter = Converter(remove_numeric_tables=True)
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
pages = document["text"].split("\f")
# assert numeric rows are removed from the table.
assert "324" not in pages[0]
assert "54x growth" not in pages[0]
@@ -31,11 +32,11 @@ def test_table_removal(Converter, xpdf_fixture):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, xpdf_fixture, caplog):
converter = Converter(valid_languages=["en"])
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
converter = Converter(valid_languages=["de"])
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
@@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture):
converter = Converter(remove_header_footer=True)
converter_no_removal = Converter(remove_header_footer=False)
pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
pages1 = document1["text"].split("\f")
pages2 = document2["text"].split("\f")
for p1, p2 in zip(pages1, pages2):
assert p1 == p2
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header and footer
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header and footer
pages = document["text"].split("\f")
assert len(pages) == 4
for page in pages:
assert "This is a header." not in page