feat: Add TextFileToDocument component (v2) (#5467)

* Add TextFileToDocument component

* Add docstrings

* Add unit tests

* Add release note file

* Make use of progress bar

* Add TextfileToDocument to __init__.py

* Use lazy % formatting in logging functions

* Remove f from non-f-string

* Add TextfileToDocument to __init__.py

* Use correct dependency extra

* Compare file path against path object

* PR feedback

* PR feedback

* Update haystack/preview/components/file_converters/txt.py

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>

* Update docstrings

* Add error handling

* Add unit test

* Reintroduce accidentally removed caplog

---------

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
bogdankostic 2023-08-01 11:34:52 +02:00 committed by GitHub
parent 8920fd6939
commit a51ca19fe4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 512 additions and 0 deletions

View File

@ -1,2 +1,3 @@
from haystack.preview.components.audio.whisper_local import LocalWhisperTranscriber
from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber
from haystack.preview.components.file_converters import TextFileToDocument

View File

@ -0,0 +1 @@
from haystack.preview.components.file_converters.txt import TextFileToDocument

View File

@ -0,0 +1,245 @@
import logging
from pathlib import Path
from typing import Optional, List, Union, Dict
from canals.errors import PipelineRuntimeError
from tqdm import tqdm
from haystack import Document
from haystack.lazy_imports import LazyImport
from haystack.preview import component

# langdetect is an optional dependency (the "preprocessing" extra). The import is
# deferred via LazyImport so this module can be imported without it installed;
# TextFileToDocument.__init__ calls langdetect_import.check(), which raises with
# the install hint below if langdetect is missing.
with LazyImport("Run 'pip install farm-haystack[preprocessing]'") as langdetect_import:
    import langdetect

# Module-level logger, per standard logging convention.
logger = logging.getLogger(__name__)
@component
class TextFileToDocument:
    """
    A component for converting one or more text files to Documents.

    Files are read page by page (pages are separated by form-feed characters, ``\\f``),
    optionally cleaned of numeric table rows, optionally language-validated, and wrapped
    into :class:`Document` objects. Unreadable files are skipped with a warning rather
    than aborting the whole run.
    """

    @component.input
    def input(self):
        class Input:
            """
            Input data for the TextFileToDocument component.

            :param paths: A list of paths to text files.
            :param meta: Optional metadata to attach to the Documents. If a list is provided, the length of the list
                must match the number of paths.
                Default: `None`
            :param encoding: The encoding of the text files. Default: `"utf-8"`
            :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                The tabular structures in documents might be noise for reader models if they
                don't have table parsing capability for finding answers. However, tables
                may also have long strings that could be possible candidates for answers.
                The rows containing strings are thus retained in this option.
                Default: `False`
            :param valid_languages: Validate languages from a list of languages specified in the [ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1).
                This option can be used to add a test for encoding errors. If the extracted text is
                not one of the valid languages, then there might be an encoding error resulting
                in garbled text.
                Default: `None`
            :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
                attributes. If you want to ensure you don't have duplicate Documents in your
                DocumentStore but texts are not unique, you can modify the metadata and pass e.g.
                `"meta"` to this field (for example `["content", "meta"]`).
                In this case the ID will be generated by using the content and the defined metadata.
                Default: `None`
            :param progress_bar: Whether to show a progress bar for the conversion process.
                Default: `True`
            """

            paths: List[Union[str, Path]]
            meta: Optional[Union[Dict, List[Dict]]]
            encoding: Optional[str]
            remove_numeric_tables: Optional[bool]
            valid_languages: Optional[List[str]]
            id_hash_keys: Optional[List[str]]
            progress_bar: Optional[bool]

        return Input

    @component.output
    def output(self):
        class Output:
            """
            Output data from the TextFileToDocument component.

            :param documents: The converted documents.
            """

            documents: List[Document]

        return Output

    def __init__(
        self,
        encoding: str = "utf-8",
        remove_numeric_tables: bool = False,
        numeric_row_threshold: float = 0.4,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        progress_bar: bool = True,
    ):
        """
        Create a TextFileToDocument component.

        :param encoding: The encoding of the text files. Default: `"utf-8"`
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
            The tabular structures in documents might be noise for reader models if they
            don't have table parsing capability for finding answers. However, tables
            may also have long strings that could be possible candidates for answers.
            The rows containing strings are thus retained in this option.
            Default: `False`
        :param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
            determine if a line in the provided text file is a numeric table row or not.
            The value is the ratio of numeric words to the total number of words in a line.
        :param valid_languages: Validate languages from a list of languages specified in the [ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1).
            This option can be used to add a test for encoding errors. If the extracted text is
            not one of the valid languages, then there might be an encoding error resulting
            in garbled text.
            Default: `None`
        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
            attributes. If you want to ensure you don't have duplicate Documents in your DocumentStore
            but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field
            (for example `["content", "meta"]`). In this case the ID will be generated by using the
            content and the defined metadata. Default: `None`
        :param progress_bar: Whether to show a progress bar for the conversion process.
            Default: `True`
        """
        # Fail early with an actionable message if the optional langdetect
        # dependency is missing (it is needed by _validate_language).
        langdetect_import.check()
        # Constructor values act as defaults that Input fields can override at run time.
        self.defaults = {
            "encoding": encoding,
            "remove_numeric_tables": remove_numeric_tables,
            "valid_languages": valid_languages,
            "id_hash_keys": id_hash_keys,
            "progress_bar": progress_bar,
        }
        self.numeric_row_threshold = numeric_row_threshold

    def run(self, data):
        """
        Convert text files to Documents.

        Files that cannot be read are skipped with a warning; files whose detected
        language is not in `data.valid_languages` produce a warning but are still
        converted.

        :param data: Input data for the TextFileToDocument component.
        """
        file_paths = data.paths
        metas = TextFileToDocument._prepare_metadata(data.meta, file_paths)

        documents = []
        for path, meta in tqdm(
            zip(file_paths, metas), total=len(file_paths), desc="Converting text files", disable=not data.progress_bar
        ):
            try:
                text = self._read_and_clean_file(
                    path=path, encoding=data.encoding, remove_numeric_tables=data.remove_numeric_tables
                )
            except Exception as e:
                # Best-effort conversion: one bad file must not abort the batch.
                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
                continue
            if data.valid_languages is not None and not TextFileToDocument._validate_language(
                text, data.valid_languages
            ):
                logger.warning(
                    "Text from file %s is not in one of the valid languages: %s. "
                    "The file may have been decoded incorrectly.",
                    path,
                    data.valid_languages,
                )

            document = Document(content=text, meta=meta, id_hash_keys=data.id_hash_keys)
            documents.append(document)

        return self.output(documents=documents)

    @staticmethod
    def _prepare_metadata(meta: Optional[Union[Dict, List[Dict]]], file_paths: List[Union[str, Path]]) -> List[Dict]:
        """
        Prepare the metadata for the Documents.

        A single dict is broadcast to all files; a list must match the number of paths.
        Each resulting dict gets a "file_path" entry (user-provided values win).

        :param meta: The metadata for the Documents.
        :param file_paths: The paths to the text files.
        :raises PipelineRuntimeError: If `meta` is a list whose length differs from `file_paths`.
        """
        if meta is None:
            return [{"file_path": str(path)} for path in file_paths]

        if isinstance(meta, dict):
            meta = [meta] * len(file_paths)

        if len(meta) != len(file_paths):
            raise PipelineRuntimeError(
                f"The number of meta entries must match the number of paths if meta is a list. "
                f"Number of paths: {len(file_paths)}, number of meta entries: {len(meta)}."
            )

        return [{**m, "file_path": m.get("file_path", str(path))} for m, path in zip(meta, file_paths)]

    def _read_and_clean_file(self, path: Union[str, Path], encoding: str, remove_numeric_tables: bool) -> str:
        """
        Read and clean the text file.

        :param path: The path to the text file.
        :param encoding: The encoding of the text file.
        :param remove_numeric_tables: Whether to remove numeric tables.
        :raises PipelineRuntimeError: If no file exists at `path`.
        :return: The text of the file cleaned from numeric tables if `remove_numeric_tables` is `True`.
        """
        if not Path(path).exists():
            raise PipelineRuntimeError(f"File at path {path} does not exist.")

        with open(path, encoding=encoding) as file:
            text = file.read()
            # Form feed ("\f") is treated as the page separator and is preserved
            # in the output so downstream components can still split on pages.
            pages = text.split("\f")
            cleaned_pages = [self._clean_page(page, remove_numeric_tables) for page in pages]
            return "\f".join(cleaned_pages)

    def _clean_page(self, page: str, remove_numeric_tables: bool) -> str:
        """
        Clean a page of text from numeric tables if `remove_numeric_tables` is `True`.

        :param page: The content of a page of a text file.
        :param remove_numeric_tables: Whether to remove numeric tables.
        :return: The text from the page cleaned from numeric tables if `remove_numeric_tables` is `True`.
        """
        cleaned_lines = page.splitlines()
        if remove_numeric_tables:
            cleaned_lines = [line for line in cleaned_lines if not self._is_numeric_row(line)]
        return "\n".join(cleaned_lines)

    def _is_numeric_row(self, line: str) -> bool:
        """
        Check if a line of a text file is a numeric row. A line is considered a numeric row if the ratio of
        words containing digits exceeds `numeric_row_threshold` and the line does not end with a period.

        :param line: The content of a line of a text file.
        """
        words = line.split()
        # Bug fix: an empty or whitespace-only line has no words, and the ratio
        # below would raise ZeroDivisionError. Blank lines are not numeric rows.
        if not words:
            return False
        digits = [word for word in words if any(char.isdigit() for char in word)]
        return len(digits) / len(words) > self.numeric_row_threshold and not line.strip().endswith(".")

    @staticmethod
    def _validate_language(text: str, valid_languages: List[str]) -> bool:
        """
        Validate if the detected language of the text is one of the valid languages.

        Returns True when `valid_languages` is empty; returns False when langdetect
        cannot determine a language.

        :param text: The text to validate.
        :param valid_languages: A list of valid languages.
        """
        if not valid_languages:
            return True

        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            # langdetect raises when the text carries no usable language signal
            # (e.g. only digits/punctuation); treat that as "not a valid language".
            lang = None

        return lang in valid_languages

View File

@ -0,0 +1,4 @@
---
features:
- |
Add new TextFileToDocument component to Haystack v2 preview so that text files can be converted to Haystack Documents.

View File

@ -0,0 +1,256 @@
import logging
from unittest.mock import patch
import pytest
from pathlib import Path
from canals.errors import PipelineRuntimeError
from langdetect import LangDetectException
from haystack.preview.components.file_converters.txt import TextFileToDocument
from test.preview.components.base import BaseTestComponent
from test.conftest import preview_samples_path
class TestTextfileToDocument(BaseTestComponent):
    """Unit tests for the TextFileToDocument component."""

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        Test if the component runs correctly.
        """
        file_paths = [preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"]
        converter = TextFileToDocument()
        output = converter.run(data=converter.input(paths=file_paths))
        docs = output.documents
        assert len(docs) == 2
        assert docs[0].content == "Some text for testing.\nTwo lines in here."
        assert docs[1].content == "This is a test line.\n123 456 789\n987 654 321."
        assert docs[0].meta["file_path"] == str(file_paths[0])
        assert docs[1].meta["file_path"] == str(file_paths[1])

    @pytest.mark.unit
    def test_run_warning_for_invalid_language(self, preview_samples_path, caplog):
        """
        Test if a warning is logged when the detected language is not a valid language,
        and that the file is still converted.
        """
        file_path = preview_samples_path / "txt" / "doc_1.txt"
        converter = TextFileToDocument()
        # Force langdetect to report English so the "de"-only check reliably fails.
        with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
            with caplog.at_level(logging.WARNING):
                output = converter.run(data=converter.input(paths=[file_path], valid_languages=["de"]))
                assert (
                    f"Text from file {file_path} is not in one of the valid languages: ['de']. "
                    f"The file may have been decoded incorrectly." in caplog.text
                )

        docs = output.documents
        assert len(docs) == 1
        assert docs[0].content == "Some text for testing.\nTwo lines in here."

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        file_paths = [preview_samples_path / "txt" / "doc_1.txt", "non_existing_file.txt"]
        converter = TextFileToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(data=converter.input(paths=file_paths))
            assert (
                "Could not read file non_existing_file.txt. Skipping it. Error message: File at path non_existing_file.txt does not exist."
                in caplog.text
            )
        # The unreadable file is skipped; only the readable one is converted.
        docs = output.documents
        assert len(docs) == 1
        assert docs[0].meta["file_path"] == str(file_paths[0])

    @pytest.mark.unit
    def test_prepare_metadata_no_metadata(self):
        """
        Test if the metadata is correctly prepared when no custom metadata is provided.
        """
        converter = TextFileToDocument()
        meta = converter._prepare_metadata(
            meta=None, file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
        )
        assert len(meta) == 2
        assert meta[0]["file_path"] == "data/sample_path_1.txt"
        assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))

    @pytest.mark.unit
    def test_prepare_metadata_single_dict(self):
        """
        Test if the metadata is correctly prepared when a single dict is provided.
        """
        converter = TextFileToDocument()
        meta = converter._prepare_metadata(
            meta={"name": "test"}, file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
        )
        assert len(meta) == 2
        assert meta[0]["file_path"] == "data/sample_path_1.txt"
        assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
        # A single dict is broadcast to every file path.
        assert meta[0]["name"] == "test"
        assert meta[1]["name"] == "test"

    @pytest.mark.unit
    def test_prepare_metadata_list_of_dicts(self):
        """
        Test if the metadata is correctly prepared when a list of dicts is provided.
        """
        converter = TextFileToDocument()
        meta = converter._prepare_metadata(
            meta=[{"name": "test1"}, {"name": "test2"}],
            file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")],
        )
        assert len(meta) == 2
        assert meta[0]["file_path"] == "data/sample_path_1.txt"
        assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
        assert meta[0]["name"] == "test1"
        assert meta[1]["name"] == "test2"

    @pytest.mark.unit
    def test_prepare_metadata_unmatching_list_len(self):
        """
        Test if an error is raised when the number of metadata dicts is not equal to the number of
        file paths.
        """
        converter = TextFileToDocument()
        with pytest.raises(
            PipelineRuntimeError, match="The number of meta entries must match the number of paths if meta is a list."
        ):
            converter._prepare_metadata(
                meta=[{"name": "test1"}, {"name": "test2"}],
                file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt"), "data/sample_path_3.txt"],
            )

    @pytest.mark.unit
    def test_read_and_clean_file(self, preview_samples_path):
        """
        Test if the file is correctly read.
        """
        file_path = preview_samples_path / "txt" / "doc_1.txt"
        converter = TextFileToDocument()
        text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
        assert text == "Some text for testing.\nTwo lines in here."

    @pytest.mark.unit
    def test_read_and_clean_file_non_existing_file(self):
        """
        Test if an error is raised when the file does not exist.
        """
        converter = TextFileToDocument()
        file_path = "non_existing_file.txt"
        with pytest.raises(PipelineRuntimeError, match=f"File at path {file_path} does not exist."):
            converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)

    @pytest.mark.unit
    def test_read_and_clean_file_remove_numeric_tables(self, preview_samples_path):
        """
        Test if the file is correctly read and numeric tables are removed.
        """
        file_path = preview_samples_path / "txt" / "doc_2.txt"
        converter = TextFileToDocument()
        text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=True)
        # "123 456 789" is dropped (numeric row); "987 654 321." is kept (ends with a period).
        assert text == "This is a test line.\n987 654 321."

    @pytest.mark.unit
    def test_clean_page_without_remove_numeric_tables(self):
        """
        Test if the page is not changed when remove_numeric_tables is False.
        """
        converter = TextFileToDocument()
        page = "This is a test line.\n123 456 789"
        cleaned_page = converter._clean_page(page=page, remove_numeric_tables=False)
        assert cleaned_page == page

    @pytest.mark.unit
    def test_clean_page_with_remove_numeric_tables(self):
        """
        Test if the page is correctly cleaned when remove_numeric_tables is True.
        """
        converter = TextFileToDocument()
        page = "This is a test line.\n123 456 789"
        cleaned_page = converter._clean_page(page=page, remove_numeric_tables=True)
        assert cleaned_page == "This is a test line."

    @pytest.mark.unit
    def test_is_numeric_row_only_numbers(self):
        """
        Test if the line is correctly identified as a numeric row when it only contains numbers.
        """
        converter = TextFileToDocument()
        line = "123 456 789"
        assert converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_only_text(self):
        """
        Test if the line is correctly identified as a non-numeric row when it only contains text.
        """
        converter = TextFileToDocument()
        line = "This is a test line."
        assert not converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_only_numbers_with_period(self):
        """
        Test if the line is correctly identified as a non-numeric row when it only contains numbers and a period at
        the end.
        """
        converter = TextFileToDocument()
        line = "123 456 789."
        assert not converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_more_numbers_than_text(self):
        """
        Test if the line is correctly identified as a numeric row when more than 40% of its words are numbers.
        """
        converter = TextFileToDocument()
        line = "123 456 789 This is a test"
        assert converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_less_numbers_than_text(self):
        """
        Test if the line is correctly identified as a non-numeric row when less than 40% of its words are numbers.
        """
        converter = TextFileToDocument()
        line = "123 456 789 This is a test line"
        assert not converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_words_consist_of_numbers_and_text(self):
        """
        Test if the line is correctly identified as a numeric row when the words consist of numbers and text.
        """
        converter = TextFileToDocument()
        line = "123eur 456usd"
        assert converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_validate_language(self):
        """
        Test if the language is correctly validated.
        """
        converter = TextFileToDocument()
        with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
            assert converter._validate_language(text="This is an english text.", valid_languages=["en"])
            assert not converter._validate_language(text="This is an english text.", valid_languages=["de"])

    @pytest.mark.unit
    def test_validate_language_no_languages_specified(self):
        """
        Test if _validate_languages returns True when no languages are specified.
        """
        converter = TextFileToDocument()
        assert converter._validate_language(text="This is an english test.", valid_languages=[])

    @pytest.mark.unit
    def test_validate_language_lang_detect_exception(self):
        """
        Test if _validate_languages returns False when langdetect throws an exception.
        """
        converter = TextFileToDocument()
        with patch(
            "haystack.preview.components.file_converters.txt.langdetect.detect",
            side_effect=LangDetectException(code=0, message="Test"),
        ):
            assert not converter._validate_language(text="This is an english text.", valid_languages=["en"])

View File

@ -0,0 +1,2 @@
Some text for testing.
Two lines in here.

View File

@ -0,0 +1,3 @@
This is a test line.
123 456 789
987 654 321.