mirror of https://github.com/deepset-ai/haystack.git (synced 2025-08-28 10:26:27 +00:00)

Refactor file converter interface (#393)

Commit 3399fc784d (parent 4e46d9d176)
BaseConverter (haystack/file_converter/base.py):

@@ -45,7 +45,16 @@ class BaseConverter:
         self.valid_languages = valid_languages
 
     @abstractmethod
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
+        """
+        Convert a file to a dictionary containing the text and any associated meta data.
+
+        File converters may extract file meta like name or size. In addition to it, user
+        supplied meta data like author, url, external IDs can be supplied as a dictionary.
+
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        """
         pass
 
     def validate_language(self, text: str) -> bool:
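The hunk above replaces extract_pages, which returned a (pages, metadata) tuple, with a single convert method that returns one document dictionary. A minimal sketch of what a subclass now implements (the DummyConverter class and its trivial file handling are illustrative, not part of this commit):

    from pathlib import Path
    from typing import Any, Dict, Optional

    from haystack.file_converter.base import BaseConverter


    class DummyConverter(BaseConverter):
        def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
            # Read the file verbatim and wrap it in the new document dict;
            # real converters put extracted text here and may merge file meta into meta.
            text = Path(file_path).read_text()
            return {"text": text, "meta": meta}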
DocxToTextConverter (haystack/file_converter/docx.py):

@@ -1,14 +1,16 @@
-from haystack.file_converter.base import BaseConverter
 import logging
 from pathlib import Path
-from typing import List, Dict, Optional, Any, Tuple
+from typing import Dict, Optional, Any
 
 import docx
 
+from haystack.file_converter.base import BaseConverter
+
 logger = logging.getLogger(__name__)
 
 
 class DocxToTextConverter(BaseConverter):
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         Extract text from a .docx file.
         Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
@@ -17,13 +19,8 @@ class DocxToTextConverter(BaseConverter):
         :param file_path: Path to the .docx file you want to convert
         """
 
-        #TODO We might want to join small passages here (e.g. titles)
-        #TODO Investigate if there's a workaround to extract on a page level rather than passage level
-        # (e.g. in the test sample it seemed that page breaks resulted in a paragraphs with only a "\n"
-
-        doc = docx.Document(file_path) # Creating word reader object.
-        fullText = []
-        for para in doc.paragraphs:
-            if para.text.strip() != "":
-                fullText.append(para.text)
-        return fullText, None
+        file = docx.Document(file_path)  # Creating word reader object.
+        paragraphs = [para.text for para in file.paragraphs]
+        text = "".join(paragraphs)
+        document = {"text": text, "meta": meta}
+        return document
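For callers, the refactor means one call and one return value. A usage sketch (the sample path is the one used in the tests below; the author value is a made-up illustration):

    from pathlib import Path

    from haystack.file_converter.docx import DocxToTextConverter

    converter = DocxToTextConverter()
    document = converter.convert(
        file_path=Path("samples/docx/sample_docx.docx"),
        meta={"author": "Jane Doe"},  # optional, user-supplied meta data
    )
    print(document["text"][:50])  # paragraph texts joined into a single string
    print(document["meta"])       # {'author': 'Jane Doe'}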
PDFToTextConverter:

@@ -2,7 +2,7 @@ import logging
 import re
 import subprocess
 from pathlib import Path
-from typing import List, Optional, Dict, Tuple, Any
+from typing import List, Optional, Dict, Any
 
 from haystack.file_converter.base import BaseConverter
 
@@ -60,7 +60,7 @@ class PDFToTextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
 
         pages = self._read_pdf(file_path, layout=False)
 
@@ -114,7 +114,9 @@ class PDFToTextConverter(BaseConverter):
                 )
                 logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": meta}
+        return document
 
     def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
         """
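Since convert now returns the whole text as one string, the page structure is preserved by joining pages with form feed characters ("\f"), so callers can recover the old per-page view with a split. A sketch, assuming pdftotext (xpdf) is installed and that PDFToTextConverter is importable from haystack.file_converter.pdf by analogy with the base, docx, and tika modules above:

    from pathlib import Path

    from haystack.file_converter.pdf import PDFToTextConverter  # import path assumed

    converter = PDFToTextConverter()
    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))

    pages = document["text"].split("\f")  # recover the page list the old extract_pages returned
    print(len(pages))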
TikaConverter (haystack/file_converter/tika.py):

@@ -81,7 +81,7 @@ class TikaConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
         :param file_path: Path of file to be converted.
 
@@ -132,4 +132,6 @@ class TikaConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
 
-        return cleaned_pages, parsed["metadata"]
+        text = "\f".join(cleaned_pages)
+        document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
+        return document
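Note the merge order in the Tika converter's meta: keys parsed by Tika come first and user-supplied keys override them, with "meta or {}" guarding against the None default. A standalone sketch of that dict merge (the values are made up):

    parsed_metadata = {"Content-Type": "application/pdf", "title": "parsed by Tika"}
    user_meta = {"title": "user override", "url": "https://example.com"}

    merged = {**parsed_metadata, **(user_meta or {})}
    # Later entries win on key collisions:
    # {'Content-Type': 'application/pdf', 'title': 'user override', 'url': 'https://example.com'}
    print(merged)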
TextConverter:

@@ -1,7 +1,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import List, Optional, Tuple, Any, Dict
+from typing import List, Optional, Any, Dict
 
 from haystack.file_converter.base import BaseConverter
 
@@ -44,7 +44,7 @@ class TextConverter(BaseConverter):
             valid_languages=valid_languages,
         )
 
-    def extract_pages(self, file_path: Path) -> Tuple[List[str], Optional[Dict[str, Any]]]:
+    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         with open(file_path) as f:
             text = f.read()
         pages = text.split("\f")
@@ -89,5 +89,7 @@ class TextConverter(BaseConverter):
             )
             logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
 
-        return cleaned_pages, None
+        text = "".join(pages)
+        document = {"text": text, "meta": meta}
+        return document
 
convert_files_to_dicts:

@@ -97,8 +97,8 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
         with open(path) as doc:
             text = doc.read()
     elif path.suffix.lower() == ".pdf" and pdf_converter:
-        pages, _ = pdf_converter.extract_pages(path)
-        text = "\n".join(pages)
+        document = pdf_converter.convert(path)
+        text = document["text"]
     else:
         raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
 
tika_convert_files_to_dicts:

@@ -138,10 +138,11 @@ def tika_convert_files_to_dicts(
 
     documents = []
     for path in file_paths:
-        pages, meta = converter.extract_pages(path)
-        meta = meta or {}
+        document = converter.convert(path)
+        meta = document["meta"] or {}
         meta["name"] = path.name
-        text = ' '.join(pages)
+        text = document["text"]
+        pages = text.split("\f")
 
         if split_paragraphs:
             if pages:
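Joining pages with "\f" in the converters and splitting on "\f" here makes page boundaries survive the round trip through the document dict, unlike the old ' '.join(pages), which discarded them. A standalone sketch (hypothetical page list):

    pages_in = ["page one", "page two", ""]  # an empty page survives the round trip
    text = "\f".join(pages_in)               # what a converter stores under "text"
    pages_out = text.split("\f")             # what tika_convert_files_to_dicts recovers
    assert pages_out == pages_in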
upload_file_to_document_store (REST API):

@@ -63,7 +63,7 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = pdf_converter.extract_pages(file_path)
+            document = pdf_converter.convert(file_path)
         elif file.filename.split(".")[-1].lower() == "txt":
             txt_converter = TextConverter(
                 remove_numeric_tables=remove_numeric_tables,
@@ -72,12 +72,12 @@ def upload_file_to_document_store(
                 remove_header_footer=remove_header_footer,
                 valid_languages=valid_languages,
             )
-            pages = txt_converter.extract_pages(file_path)
+            document = txt_converter.convert(file_path)
         else:
             raise HTTPException(status_code=415, detail=f"Only .pdf and .txt file formats are supported.")
 
-        document = {TEXT_FIELD_NAME: "\n".join(pages), "name": file.filename}
-        document_store.write_documents([document])
+        document_to_write = {TEXT_FIELD_NAME: document["text"], "name": file.filename}
+        document_store.write_documents([document_to_write])
         return "File upload was successful."
     finally:
         file.file.close()
Tests (DocxToTextConverter):

@@ -3,8 +3,7 @@ from pathlib import Path
 from haystack.file_converter.docx import DocxToTextConverter
 
 
-def test_extract_pages():
+def test_convert():
     converter = DocxToTextConverter()
-    paragraphs, _ = converter.extract_pages(file_path=Path("samples/docx/sample_docx.docx"))
-    assert len(paragraphs) == 8  # Sample has 8 Paragraphs
-    assert paragraphs[1] == 'The US has "passed the peak" on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month.'
+    document = converter.convert(file_path=Path("samples/docx/sample_docx.docx"))
+    assert document["text"].startswith("Sample Docx File")
Tests (PDFToTextConverter / TikaConverter):

@@ -7,9 +7,10 @@ from haystack.file_converter.tika import TikaConverter
 
 
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_extract_pages(Converter, xpdf_fixture):
+def test_convert(Converter, xpdf_fixture):
     converter = Converter()
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     assert len(pages) == 4  # the sample PDF file has four pages.
     assert pages[0] != ""  # the page 1 of PDF contains text.
     assert pages[2] == ""  # the page 3 of PDF file is empty.
@@ -18,8 +19,8 @@ def test_extract_pages(Converter, xpdf_fixture):
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_table_removal(Converter, xpdf_fixture):
     converter = Converter(remove_numeric_tables=True)
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    pages = document["text"].split("\f")
     # assert numeric rows are removed from the table.
     assert "324" not in pages[0]
     assert "54x growth" not in pages[0]
@@ -31,11 +32,11 @@ def test_table_removal(Converter, xpdf_fixture):
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
 def test_language_validation(Converter, xpdf_fixture, caplog):
     converter = Converter(valid_languages=["en"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
 
     converter = Converter(valid_languages=["de"])
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
     assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
 
 
@@ -44,12 +45,15 @@ def test_header_footer_removal(Converter, xpdf_fixture):
     converter = Converter(remove_header_footer=True)
     converter_no_removal = Converter(remove_header_footer=False)
 
-    pages1, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
-    pages2, _ = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    pages1 = document1["text"].split("\f")
+    pages2 = document2["text"].split("\f")
     for p1, p2 in zip(pages1, pages2):
         assert p2 == p2
 
-    pages, _ = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    pages = document["text"].split("\f")
     assert len(pages) == 4
     for page in pages:
         assert "This is a header." not in page