Standardize TextFileToDocument (#6232)

* simplify textfiletodocument

* fix error handling and tests

* stray print

* reno

* streams->sources

* reno

* feedback

* test

* fix tests
ZanSara 2023-11-17 14:39:39 +00:00 committed by GitHub
parent c26a932423
commit e888852aec
4 changed files with 76 additions and 402 deletions


@@ -1,15 +1,9 @@
import logging
from pathlib import Path
from typing import Optional, List, Union, Dict
from typing import List, Union
from canals.errors import PipelineRuntimeError
from tqdm import tqdm
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import Document, component
with LazyImport("Run 'pip install langdetect'") as langdetect_import:
import langdetect
from haystack.preview.dataclasses import ByteStream
logger = logging.getLogger(__name__)
@@ -21,189 +15,42 @@ class TextFileToDocument:
A component for converting a text file to a Document.
"""
def __init__(
self,
encoding: str = "utf-8",
remove_numeric_tables: bool = False,
numeric_row_threshold: float = 0.4,
valid_languages: Optional[List[str]] = None,
progress_bar: bool = True,
):
def __init__(self, encoding: str = "utf-8"):
"""
Create a TextFileToDocument component.
:param encoding: The encoding of the text files. Default: `"utf-8"`
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for reader models if they don't have table parsing
capability for finding answers. However, tables may also have long strings that could be possible candidates
for answers. The rows containing strings are thus retained in this option. Default: `False`
:param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
numeric words to the total number of words in a line. Default: `0.4`
:param valid_languages: Validate languages from a list of languages specified in the
[ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1). This option can be used to add a test for
encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
error resulting in garbled text. Default: `None`
:param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
:param encoding: The default encoding of the text files. Default: `"utf-8"`.
Note that if the encoding is specified in the metadata of a ByteStream,
it will override this default.
"""
langdetect_import.check()
self.encoding = encoding
self.remove_numeric_tables = remove_numeric_tables
self.numeric_row_threshold = numeric_row_threshold
self.valid_languages = valid_languages or []
self.progress_bar = progress_bar
@component.output_types(documents=List[Document])
def run(
self,
paths: List[Union[str, Path]],
metadata: Optional[Union[Dict, List[Dict]]] = None,
encoding: Optional[str] = None,
remove_numeric_tables: Optional[bool] = None,
numeric_row_threshold: Optional[float] = None,
valid_languages: Optional[List[str]] = None,
progress_bar: Optional[bool] = None,
):
def run(self, sources: List[Union[str, Path, ByteStream]]):
"""
Convert text files to Documents.
:param paths: A list of paths to text files.
:param metadata: Optional metadata to attach to the Documents. If a list is provided, the length of the list
must match the number of paths. Default: `None`
:param encoding: The encoding of the text files. Default: `"utf-8"`
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for reader models if they don't have table parsing
capability for finding answers. However, tables may also have long strings that could be possible candidates
for answers. The rows containing strings are thus retained in this option. Default: `False`
:param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
numeric words to the total number of words in a line. Default: `0.4`
:param valid_languages: Validate languages from a list of languages specified in the
[ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1). This option can be used to add a test for
encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
error resulting in garbled text. Default: `None`
:param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
:param sources: A list of paths to text files or ByteStream objects.
Note that if an encoding is specified in the metadata of a ByteStream,
it will override the component's default.
:return: A dictionary containing the converted documents.
"""
if encoding is None:
encoding = self.encoding
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if numeric_row_threshold is None:
numeric_row_threshold = self.numeric_row_threshold
if valid_languages is None:
valid_languages = self.valid_languages
if progress_bar is None:
progress_bar = self.progress_bar
metas = TextFileToDocument._prepare_metadata(metadata, paths)
documents = []
for path, meta in tqdm(
zip(paths, metas), total=len(paths), desc="Converting text files", disable=not progress_bar
):
for source in sources:
if isinstance(source, (Path, str)):
try:
path = source
source = ByteStream.from_file_path(Path(source))
source.metadata["file_path"] = str(path)
except Exception as e:
logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
continue
try:
text = self._read_and_clean_file(
path=path, encoding=encoding, remove_numeric_tables=remove_numeric_tables
)
encoding = source.metadata.get("encoding", self.encoding)
document = Document(content=source.data.decode(encoding))
document.meta = source.metadata
documents.append(document)
except Exception as e:
logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
continue
if valid_languages is not None and not TextFileToDocument._validate_language(text, valid_languages):
logger.warning(
"Text from file %s is not in one of the valid languages: %s. "
"The file may have been decoded incorrectly.",
path,
valid_languages,
)
document = Document(content=text, meta=meta)
documents.append(document)
logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
return {"documents": documents}
@staticmethod
def _prepare_metadata(metadata: Optional[Union[Dict, List[Dict]]], paths: List[Union[str, Path]]) -> List[Dict]:
"""
Prepare the metadata for the Documents.
:param metadata: The metadata for the Documents.
:param paths: The paths to the text files.
"""
if metadata is None:
return [{"file_path": str(path)} for path in paths]
if isinstance(metadata, dict):
metadata = [metadata] * len(paths)
if len(metadata) != len(paths):
raise PipelineRuntimeError(
f"The number of metadata entries must match the number of paths if metadata is a list. "
f"Number of paths: {len(paths)}, number of metadata entries: {len(metadata)}."
)
return [{**m, "file_path": m.get("file_path", str(path))} for m, path in zip(metadata, paths)]
def _read_and_clean_file(self, path: Union[str, Path], encoding: str, remove_numeric_tables: bool) -> str:
"""
Read and clean the text file.
:param path: The path to the text file.
:param encoding: The encoding of the text file.
:param remove_numeric_tables: Whether to remove numeric tables.
:return: The text of the file cleaned from numeric tables if `remove_numeric_tables` is `True`.
"""
if not Path(path).exists():
raise PipelineRuntimeError(f"File at path {path} does not exist.")
with open(path, encoding=encoding) as file:
text = file.read()
pages = text.split("\f")
cleaned_pages = [self._clean_page(page, remove_numeric_tables) for page in pages]
return "\f".join(cleaned_pages)
def _clean_page(self, page: str, remove_numeric_tables: bool) -> str:
"""
Clean a page of text from numeric tables if `remove_numeric_tables` is `True`.
:param page: The content of a page of a text file.
:param remove_numeric_tables: Whether to remove numeric tables.
:return: The text from the page cleaned from numeric tables if `remove_numeric_tables` is `True`.
"""
cleaned_lines = page.splitlines()
if remove_numeric_tables:
cleaned_lines = [line for line in cleaned_lines if not self._is_numeric_row(line)]
return "\n".join(cleaned_lines)
def _is_numeric_row(self, line: str) -> bool:
"""
Check if a line of a text file is a numeric row. A line is considered a numeric row if it contains more
than 40% digits and does not end with a period.
:param line: The content of a line of a text file.
"""
words = line.split()
digits = [word for word in words if any(char.isdigit() for char in word)]
return len(digits) / len(words) > self.numeric_row_threshold and not line.strip().endswith(".")
@staticmethod
def _validate_language(text: str, valid_languages: List[str]) -> bool:
"""
Validate if the detected language of the text is one of the valid languages.
:param text: The text to validate.
:param valid_languages: A list of valid languages.
"""
if not valid_languages:
return True
try:
lang = langdetect.detect(text)
except langdetect.lang_detect_exception.LangDetectException:
lang = None
return lang in valid_languages


@@ -0,0 +1,3 @@
preview:
- Remove most parameters from TextFileToDocument to make it match all other converters.
- Add support for ByteStreams
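A short sketch of the per-stream encoding override described in the converter's docstring above, assuming ByteStream accepts data and metadata keyword arguments, as its use in the component suggests:

from haystack.preview.components.file_converters.txt import TextFileToDocument
from haystack.preview.dataclasses import ByteStream

converter = TextFileToDocument(encoding="utf-8")  # component-wide default

# The "encoding" metadata key takes precedence over the component default
# when the stream's bytes are decoded.
stream = ByteStream(data="Grüße aus dem Test".encode("utf-16"), metadata={"encoding": "utf-16"})
docs = converter.run(sources=[stream])["documents"]
assert docs[0].content == "Grüße aus dem Test"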


@@ -4,253 +4,66 @@ from pathlib import Path
import pytest
from canals.errors import PipelineRuntimeError
from langdetect import LangDetectException
from haystack.preview.dataclasses import ByteStream
from haystack.preview.components.file_converters.txt import TextFileToDocument
class TestTextfileToDocument: # pylint: disable=R0904
class TestTextfileToDocument:
@pytest.mark.unit
def test_run(self, preview_samples_path):
"""
Test if the component runs correctly.
"""
paths = [preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"]
bytestream = ByteStream.from_file_path(preview_samples_path / "txt" / "doc_3.txt")
bytestream.metadata["file_path"] = str(preview_samples_path / "txt" / "doc_3.txt")
bytestream.metadata["key"] = "value"
files = [
str(preview_samples_path / "txt" / "doc_1.txt"),
preview_samples_path / "txt" / "doc_2.txt",
bytestream,
]
converter = TextFileToDocument()
output = converter.run(paths=paths)
output = converter.run(sources=files)
docs = output["documents"]
assert len(docs) == 2
assert docs[0].content == "Some text for testing.\nTwo lines in here."
assert docs[1].content == "This is a test line.\n123 456 789\n987 654 321."
assert docs[0].meta["file_path"] == str(paths[0])
assert docs[1].meta["file_path"] == str(paths[1])
@pytest.mark.unit
def test_run_warning_for_invalid_language(self, preview_samples_path, caplog):
file_path = preview_samples_path / "txt" / "doc_1.txt"
converter = TextFileToDocument()
with patch(
"haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"
), caplog.at_level(logging.WARNING):
output = converter.run(paths=[file_path], valid_languages=["de"])
assert (
f"Text from file {file_path} is not in one of the valid languages: ['de']. "
f"The file may have been decoded incorrectly." in caplog.text
)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content == "Some text for testing.\nTwo lines in here."
assert len(docs) == 3
assert "Some text for testing." in docs[0].content
assert "This is a test line." in docs[1].content
assert "That's yet another file!" in docs[2].content
assert docs[0].meta["file_path"] == str(files[0])
assert docs[1].meta["file_path"] == str(files[1])
assert docs[2].meta == bytestream.metadata
@pytest.mark.unit
def test_run_error_handling(self, preview_samples_path, caplog):
"""
Test if the component correctly handles errors.
"""
paths = [preview_samples_path / "txt" / "doc_1.txt", "non_existing_file.txt"]
paths = [
preview_samples_path / "txt" / "doc_1.txt",
"non_existing_file.txt",
preview_samples_path / "txt" / "doc_3.txt",
]
converter = TextFileToDocument()
with caplog.at_level(logging.WARNING):
output = converter.run(paths=paths)
assert (
"Could not read file non_existing_file.txt. Skipping it. Error message: File at path non_existing_file.txt does not exist."
in caplog.text
)
output = converter.run(sources=paths)
assert "non_existing_file.txt" in caplog.text
docs = output["documents"]
assert len(docs) == 1
assert len(docs) == 2
assert docs[0].meta["file_path"] == str(paths[0])
assert docs[1].meta["file_path"] == str(paths[2])
@pytest.mark.unit
def test_prepare_metadata_no_metadata(self):
def test_encoding_override(self, preview_samples_path):
"""
Test if the metadata is correctly prepared when no custom metadata is provided.
Test if the encoding metadata field is used properly.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata=None, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
bytestream = ByteStream.from_file_path(preview_samples_path / "txt" / "doc_1.txt")
bytestream.metadata["key"] = "value"
@pytest.mark.unit
def test_prepare_metadata_single_dict(self):
"""
Test if the metadata is correctly prepared when a single dict is provided.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata={"name": "test"}, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
assert meta[0]["name"] == "test"
assert meta[1]["name"] == "test"
converter = TextFileToDocument(encoding="utf-16")
output = converter.run(sources=[bytestream])
assert "Some text for testing." not in output["documents"][0].content
@pytest.mark.unit
def test_prepare_metadata_list_of_dicts(self):
"""
Test if the metadata is correctly prepared when a list of dicts is provided.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata=[{"name": "test1"}, {"name": "test2"}],
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")],
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
assert meta[0]["name"] == "test1"
assert meta[1]["name"] == "test2"
@pytest.mark.unit
def test_prepare_metadata_unmatching_list_len(self):
"""
Test if an error is raised when the number of metadata dicts is not equal to the number of
file paths.
"""
converter = TextFileToDocument()
with pytest.raises(
PipelineRuntimeError,
match="The number of metadata entries must match the number of paths if metadata is a list.",
):
converter._prepare_metadata(
metadata=[{"name": "test1"}, {"name": "test2"}],
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt"), "data/sample_path_3.txt"],
)
@pytest.mark.unit
def test_read_and_clean_file(self, preview_samples_path):
"""
Test if the file is correctly read.
"""
file_path = preview_samples_path / "txt" / "doc_1.txt"
converter = TextFileToDocument()
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
assert text == "Some text for testing.\nTwo lines in here."
@pytest.mark.unit
def test_read_and_clean_file_non_existing_file(self):
"""
Test if an error is raised when the file does not exist.
"""
converter = TextFileToDocument()
file_path = "non_existing_file.txt"
with pytest.raises(PipelineRuntimeError, match=f"File at path {file_path} does not exist."):
converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
@pytest.mark.unit
def test_read_and_clean_file_remove_numeric_tables(self, preview_samples_path):
"""
Test if the file is correctly read and numeric tables are removed.
"""
file_path = preview_samples_path / "txt" / "doc_2.txt"
converter = TextFileToDocument()
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=True)
assert text == "This is a test line.\n987 654 321."
@pytest.mark.unit
def test_clean_page_without_remove_numeric_tables(self):
"""
Test if the page is not changed when remove_numeric_tables is False.
"""
converter = TextFileToDocument()
page = "This is a test line.\n123 456 789"
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=False)
assert cleaned_page == page
@pytest.mark.unit
def test_clean_page_with_remove_numeric_tables(self):
"""
Test if the page is correctly cleaned when remove_numeric_tables is True.
"""
converter = TextFileToDocument()
page = "This is a test line.\n123 456 789"
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=True)
assert cleaned_page == "This is a test line."
@pytest.mark.unit
def test_is_numeric_row_only_numbers(self):
"""
Test if the line is correctly identified as a numeric row when it only contains numbers.
"""
converter = TextFileToDocument()
line = "123 456 789"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_only_text(self):
"""
Test if the line is correctly identified as a non-numeric row when it only contains text.
"""
converter = TextFileToDocument()
line = "This is a test line."
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_only_numbers_with_period(self):
"""
Test if the line is correctly identified as a non-numeric row when it only contains numbers and a period at
the end.
"""
converter = TextFileToDocument()
line = "123 456 789."
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_more_numbers_than_text(self):
"""
Test if the line is correctly identified as a numeric row when it consists of more than 40% numbers.
"""
converter = TextFileToDocument()
line = "123 456 789 This is a test"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_less_numbers_than_text(self):
"""
Test if the line is correctly identified as a non-numeric row when it consists of less than 40% numbers.
"""
converter = TextFileToDocument()
line = "123 456 789 This is a test line"
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_words_consist_of_numbers_and_text(self):
"""
Test if the line is correctly identified as a numeric row when the words consist of numbers and text.
"""
converter = TextFileToDocument()
line = "123eur 456usd"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_validate_language(self):
"""
Test if the language is correctly validated.
"""
converter = TextFileToDocument()
with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
assert converter._validate_language(text="This is an english text.", valid_languages=["en"])
assert not converter._validate_language(text="This is an english text.", valid_languages=["de"])
@pytest.mark.unit
def test_validate_language_no_languages_specified(self):
"""
Test if _validate_languages returns True when no languages are specified.
"""
converter = TextFileToDocument()
assert converter._validate_language(text="This is an english test.", valid_languages=[])
@pytest.mark.unit
def test_validate_language_lang_detect_exception(self):
"""
Test if _validate_languages returns False when langdetect throws an exception.
"""
converter = TextFileToDocument()
with patch(
"haystack.preview.components.file_converters.txt.langdetect.detect",
side_effect=LangDetectException(code=0, message="Test"),
):
assert not converter._validate_language(text="This is an english text.", valid_languages=["en"])
bytestream.metadata["encoding"] = "utf-8"
output = converter.run(sources=[bytestream])
assert "Some text for testing." in output["documents"][0].content


@@ -0,0 +1,11 @@
That's yet another file!
it contains
many
empty lines.