Refactor file converter interface (#393)

parent 4e46d9d176
commit 3399fc784d
@@ -45,7 +45,16 @@ class BaseConverter:
         self.valid_languages = valid_languages
 
     @abstractmethod
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
+        """
+        Convert a file to a dictionary containing the text and any associated meta data.
+
+        File converters may extract file meta like name or size. In addition to it, user
+        supplied meta data like author, url, external IDs can be supplied as a dictionary.
+
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        """
         pass
 
     def validate_language(self, text: str) -> bool:
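The new contract: every converter returns a single document dict instead of a (pages, metadata) tuple. A minimal usage sketch of the refactored interface (the meta value is hypothetical; the sample path is the one used by the tests below):

    from pathlib import Path

    from haystack.file_converter.docx import DocxToTextConverter

    converter = DocxToTextConverter()
    document = converter.convert(
        file_path=Path("samples/docx/sample_docx.docx"),  # sample file from the test suite
        meta={"author": "Jane Doe"},  # hypothetical user-supplied meta
    )
    assert set(document.keys()) == {"text", "meta"}  # the document dict exposes "text" and "meta"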
@@ -1,14 +1,16 @@
-from haystack.file_converter.base import BaseConverter
 import logging
 from pathlib import Path
-from typing import List, Dict, Optional, Any, Tuple
+from typing import Dict, Optional, Any
 
 import docx
 
+from haystack.file_converter.base import BaseConverter
+
 logger = logging.getLogger(__name__)
 
 
 class DocxToTextConverter(BaseConverter):
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         Extract text from a .docx file.
         Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter):
         :param file_path: Path to the .docx file you want to convert
         """
-
-        #TODO We might want to join small passages here (e.g. titles)
-        #TODO Investigate if there's a workaround to extract on a page level rather than passage level
-        # (e.g. in the test sample it seemed that page breaks resulted in a paragraphs with only a "\n"
-
-        doc = docx.Document(file_path)  # Creating word reader object.
-        fullText = []
-        for para in doc.paragraphs:
-            if para.text.strip() != "":
-                fullText.append(para.text)
-        return fullText, None
+        file = docx.Document(file_path)  # Creating word reader object.
+        paragraphs = [para.text for para in file.paragraphs]
+        text = "".join(paragraphs)
+        document = {"text": text, "meta": meta}
+        return document
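Note that the refactored docx converter joins paragraphs with an empty string, so paragraph boundaries are not preserved in document["text"]. A standalone illustration (hypothetical paragraph values, not part of the commit):

    paragraphs = ["Sample Docx File", "Some text"]  # hypothetical extracted paragraphs
    text = "".join(paragraphs)
    assert text == "Sample Docx FileSome text"  # boundaries are lost in the joined text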
@@ -2,7 +2,7 @@ import logging
 import re
 import subprocess
 from pathlib import Path
-from typing import List, Optional, Dict, Tuple, Any
+from typing import List, Optional, Dict, Any
 
 from haystack.file_converter.base import BaseConverter
@@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
 
         pages = self._read_pdf(file_path, layout=False)
@@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": meta}
+        return document
 
     def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
         """
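Because the PDF pages are now joined with "\f" (form feed), callers can still recover per-page text from the single string. A quick round-trip illustration (standalone Python, hypothetical page values):

    pages = ["page one", "page two", "", "page four"]  # hypothetical extracted pages
    text = "\f".join(pages)  # what convert() stores under document["text"]
    assert text.split("\f") == pages  # empty pages survive the round trip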
@@ -81,7 +81,7 @@ class TikaConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         :param file_path: Path of file to be converted.
@@ -132,4 +132,6 @@ class TikaConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
 
-        return cleaned_pages, parsed["metadata"]
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
+        return document
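In the Tika converter the returned meta merges Tika's parsed metadata with the user-supplied dict; because later entries win in a dict-unpacking merge, user values take precedence. A standalone illustration (hypothetical values, not part of the commit):

    parsed_metadata = {"Content-Type": "application/pdf", "author": "tika-guess"}
    meta = {"author": "Jane Doe"}  # hypothetical user-supplied meta
    merged = {**parsed_metadata, **(meta or {})}
    assert merged["author"] == "Jane Doe"  # user-supplied value overrides the parsed one
    assert merged["Content-Type"] == "application/pdf"  # parsed-only keys are kept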
@@ -1,7 +1,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import List, Optional, Tuple, Any, Dict
+from typing import List, Optional, Any, Dict
 
 from haystack.file_converter.base import BaseConverter
@@ -44,7 +44,7 @@ class TextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         with open(file_path) as f:
             text = f.read()
         pages = text.split("\f")
@@ -89,5 +89,7 @@ class TextConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "".join(pages)
+        document = {"text": text, "meta": meta}
+        return document
@@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
             with open(path) as doc:
                 text = doc.read()
         elif path.suffix.lower() == ".pdf" and pdf_converter:
-            pages, _ = pdf_converter.extract_pages(path)
-            text = "\n".join(pages)
+            document = pdf_converter.convert(path)
+            text = document["text"]
         else:
             raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
@@ -138,10 +138,11 @@ def tika_convert_files_to_dicts(
 
     documents = []
     for path in file_paths:
-        pages, meta = converter.extract_pages(path)
-        meta = meta or {}
+        document = converter.convert(path)
+        meta = document["meta"] or {}
         meta["name"] = path.name
-        text = ' '.join(pages)
+        text = document["text"]
+        pages = text.split("\f")
 
         if split_paragraphs:
             if pages:
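The updated loop reads both text and meta back out of the returned dict, and splitting on "\f" restores the page list that extract_pages() used to return directly. The loop body in isolation (hypothetical document value, not part of the commit):

    document = {"text": "page one\fpage two", "meta": None}  # hypothetical convert() output
    meta = document["meta"] or {}  # tolerate converters that return meta=None
    meta["name"] = "sample.pdf"  # hypothetical file name (path.name in the real loop)
    pages = document["text"].split("\f")
    assert pages == ["page one", "page two"]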
@@ -63,7 +63,7 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = pdf_converter.extract_pages(file_path)
+            document = pdf_converter.convert(file_path)
         elif file.filename.split(".")[-1].lower() == "txt":
             txt_converter = TextConverter(
                 remove_numeric_tables=remove_numeric_tables,
@@ -72,12 +72,12 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = txt_converter.extract_pages(file_path)
+            document = txt_converter.convert(file_path)
         else:
             raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
 
-        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
-        document_store.write_documents([document])
+        document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
+        document_store.write_documents([document_to_write])
         return "File upload was successful."
     finally:
         file.file.close()
@@ -3,8 +3,7 @@ from pathlib import Path
 from haystack.file_converter.docx import DocxToTextConverter
 
 
-def test_extract_pages():
+def test_convert():
     converter = DocxToTextConverter()
-    paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx"))
-    assert len(paragraphs) == 8  # Sample has 8 Paragraphs
-    assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'
+    document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
+    assert document["text"].startswith("Sample Docx File")
@@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter
 
 
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_extract_pages(Converter, xpdf_fixture):
+def test_convert(Converter, xpdf_fixture):
     converter = Converter()
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     assert len(pages) == 4  # the sample PDF file has four pages.
     assert pages[0] != ""  # the page 1 of PDF contains text.
     assert pages[2] == ""  # the page 3 of PDF file is empty.
@@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture):
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter, xpdf_fixture):
     converter = Converter(remove_numeric_tables=True)
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
-
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     # assert numeric rows are removed from the table.
     assert "324" not in pages[0]
     assert "54x growth" not in pages[0]
@@ -31,11 +32,11 @@
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_language_validation(Converter, xpdf_fixture, caplog):
     converter = Converter(valid_languages=["en"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
 
     converter = Converter(valid_languages=["de"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
@@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture):
     converter = Converter(remove_header_footer=True)
     converter_no_removal = Converter(remove_header_footer=False)
 
-    pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
-    pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    pages1 = document1["text"].split("\f")
+    pages2 = document2["text"].split("\f")
     for p1, p2 in zip(pages1, pages2):
         assert p1 == p2
 
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    pages = document["text"].split("\f")
     assert len(pages) == 4
     for page in pages:
         assert "This is a header." not in page