Refactor file converter interface (#393)

This commit is contained in:
Tanay Soni 2020-09-18 10:42:13 +02:00 committed by GitHub
parent 4e46d9d176
commit 3399fc784d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 60 additions and 44 deletions

View File

@ -45,7 +45,16 @@ class BaseConverter:
self.valid_languages = valid_languages self.valid_languages = valid_languages
@abstractmethod @abstractmethod
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
File converters may extract file metadata such as name or size. In addition, user-supplied
metadata such as author, URL, or external IDs can be passed in as a dictionary.
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to include in the returned document.
"""
pass pass
def validate_language(self, text: str) -> bool: def validate_language(self, text: str) -> bool:

View File

@ -1,14 +1,16 @@
from haystack.file_converter.base import BaseConverter
import logging import logging
from pathlib import Path from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple from typing import Dict, Optional, Any
import docx import docx
from haystack.file_converter.base import BaseConverter
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class DocxToTextConverter(BaseConverter): class DocxToTextConverter(BaseConverter):
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
""" """
Extract text from a .docx file. Extract text from a .docx file.
Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here. Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter):
:param file_path: Path to the .docx file you want to convert :param file_path: Path to the .docx file you want to convert
""" """
#TODO We might want to join small passages here (e.g. titles) file = docx.Document(file_path) # Creating word reader object.
#TODO Investigate if there's a workaround to extract on a page level rather than passage level paragraphs = [para.text for para in file.paragraphs]
# (e.g. in the test sample it seemed that page breaks resulted in a paragraphs with only a "\n" text = "".join(paragraphs)
document = {"text": text, "meta": meta}
doc = docx.Document(file_path) # Creating word reader object. return document
fullText = []
for para in doc.paragraphs:
if para.text.strip() != "":
fullText.append(para.text)
return fullText, None

View File

@ -2,7 +2,7 @@ import logging
import re import re
import subprocess import subprocess
from pathlib import Path from pathlib import Path
from typing import List, Optional, Dict, Tuple, Any from typing import List, Optional, Dict, Any
from haystack.file_converter.base import BaseConverter from haystack.file_converter.base import BaseConverter
@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter):
valid_languages=valid_languages, valid_languages=valid_languages,
) )
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
pages = self._read_pdf(file_path, layout=False) pages = self._read_pdf(file_path, layout=False)
@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter):
) )
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}") logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
return cleaned_pages, None text = "\f".join(cleaned_pages)
document = {"text": text, "meta": meta}
return document
def _read_pdf(self, file_path: Path, layout: bool) -> List[str]: def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
""" """

View File

@ -81,7 +81,7 @@ class TikaConverter(BaseConverter):
valid_languages=valid_languages, valid_languages=valid_languages,
) )
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
""" """
:param file_path: Path of file to be converted. :param file_path: Path of file to be converted.
@ -132,4 +132,6 @@ class TikaConverter(BaseConverter):
) )
logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}") logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
return cleaned_pages, parsed["metadata"] text = "\f".join(cleaned_pages)
document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
return document

View File

@ -1,7 +1,7 @@
import logging import logging
import re import re
from pathlib import Path from pathlib import Path
from typing import List, Optional, Tuple, Any, Dict from typing import List, Optional, Any, Dict
from haystack.file_converter.base import BaseConverter from haystack.file_converter.base import BaseConverter
@ -44,7 +44,7 @@ class TextConverter(BaseConverter):
valid_languages=valid_languages, valid_languages=valid_languages,
) )
def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]: def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
with open(file_path) as f: with open(file_path) as f:
text = f.read() text = f.read()
pages = text.split("\f") pages = text.split("\f")
@ -89,5 +89,7 @@ class TextConverter(BaseConverter):
) )
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}") logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
return cleaned_pages, None text = "".join(pages)
document = {"text": text, "meta": meta}
return document

View File

@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
with open(path) as doc: with open(path) as doc:
text = doc.read() text = doc.read()
elif path.suffix.lower() == ".pdf" and pdf_converter: elif path.suffix.lower() == ".pdf" and pdf_converter:
pages, _ = pdf_converter.extract_pages(path) document = pdf_converter.convert(path)
text = "\n".join(pages) text = document["text"]
else: else:
raise Exception(f"Indexing of {path.suffix} files is not currently supported.") raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
@ -138,10 +138,11 @@ def tika_convert_files_to_dicts(
documents = [] documents = []
for path in file_paths: for path in file_paths:
pages, meta = converter.extract_pages(path) document = converter.convert(path)
meta = meta or {} meta = document["meta"] or {}
meta["name"] = path.name meta["name"] = path.name
text = ' '.join(pages) text = document["text"]
pages = text.split("\f")
if split_paragraphs: if split_paragraphs:
if pages: if pages:

View File

@ -63,7 +63,7 @@ def upload_file_to_document_store(
remove_header_footer=remove_header_footer, remove_header_footer=remove_header_footer,
valid_languages=valid_languages, valid_languages=valid_languages,
) )
pages = pdf_converter.extract_pages(file_path) document = pdf_converter.convert(file_path)
elif file.filename.split(".")[-1].lower() == "txt": elif file.filename.split(".")[-1].lower() == "txt":
txt_converter = TextConverter( txt_converter = TextConverter(
remove_numeric_tables=remove_numeric_tables, remove_numeric_tables=remove_numeric_tables,
@ -72,12 +72,12 @@ def upload_file_to_document_store(
remove_header_footer=remove_header_footer, remove_header_footer=remove_header_footer,
valid_languages=valid_languages, valid_languages=valid_languages,
) )
pages = txt_converter.extract_pages(file_path) document = txt_converter.convert(file_path)
else: else:
raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.") raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename} document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
document_store.write_documents([document]) document_store.write_documents([document_to_write])
return "File upload was successful." return "File upload was successful."
finally: finally:
file.file.close() file.file.close()

View File

@ -3,8 +3,7 @@ from pathlib import Path
from haystack.file_converter.docx import DocxToTextConverter from haystack.file_converter.docx import DocxToTextConverter
def test_extract_pages(): def test_convert():
converter = DocxToTextConverter() converter = DocxToTextConverter()
paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx")) document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
assert len(paragraphs) == 8 # Sample has 8 Paragraphs assert document["text"].startswith("Sample Docx File")
assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'

View File

@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter]) @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_extract_pages(Converter, xpdf_fixture): def test_convert(Converter, xpdf_fixture):
converter = Converter() converter = Converter()
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
pages = document["text"].split("\f")
assert len(pages) == 4 # the sample PDF file has four pages. assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != "" # the page 1 of PDF contains text. assert pages[0] != "" # the page 1 of PDF contains text.
assert pages[2] == "" # the page 3 of PDF file is empty. assert pages[2] == "" # the page 3 of PDF file is empty.
@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter]) @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter, xpdf_fixture): def test_table_removal(Converter, xpdf_fixture):
converter = Converter(remove_numeric_tables=True) converter = Converter(remove_numeric_tables=True)
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
pages = document["text"].split("\f")
# assert numeric rows are removed from the table. # assert numeric rows are removed from the table.
assert "324" not in pages[0] assert "324" not in pages[0]
assert "54x growth" not in pages[0] assert "54x growth" not in pages[0]
@ -31,11 +32,11 @@ def test_table_removal(Converter, xpdf_fixture):
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter]) @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, xpdf_fixture, caplog): def test_language_validation(Converter, xpdf_fixture, caplog):
converter = Converter(valid_languages=["en"]) converter = Converter(valid_languages=["en"])
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
converter = Converter(valid_languages=["de"]) converter = Converter(valid_languages=["de"])
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture):
converter = Converter(remove_header_footer=True) converter = Converter(remove_header_footer=True)
converter_no_removal = Converter(remove_header_footer=False) converter_no_removal = Converter(remove_header_footer=False)
pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
pages1 = document1["text"].split("\f")
pages2 = document2["text"].split("\f")
for p1, p2 in zip(pages1, pages2): for p1, p2 in zip(pages1, pages2):
assert p2 == p2 assert p2 == p2
pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header and footer document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header and footer
pages = document["text"].split("\f")
assert len(pages) == 4 assert len(pages) == 4
for page in pages: for page in pages:
assert "This is a header." not in page assert "This is a header." not in page