mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-29 07:59:27 +00:00
Standardize TextFileToDocument (#6232)
* simplify textfiletodocument * fix error handling and tests * stray print * reno * streams->sources * reno * feedback * test * fix tests
This commit is contained in:
parent
c26a932423
commit
e888852aec
@ -1,15 +1,9 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Union, Dict
|
||||
from typing import List, Union
|
||||
|
||||
from canals.errors import PipelineRuntimeError
|
||||
from tqdm import tqdm
|
||||
|
||||
from haystack.preview.lazy_imports import LazyImport
|
||||
from haystack.preview import Document, component
|
||||
|
||||
with LazyImport("Run 'pip install langdetect'") as langdetect_import:
|
||||
import langdetect
|
||||
from haystack.preview.dataclasses import ByteStream
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -21,189 +15,42 @@ class TextFileToDocument:
|
||||
A component for converting a text file to a Document.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
encoding: str = "utf-8",
|
||||
remove_numeric_tables: bool = False,
|
||||
numeric_row_threshold: float = 0.4,
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
progress_bar: bool = True,
|
||||
):
|
||||
def __init__(self, encoding: str = "utf-8"):
|
||||
"""
|
||||
Create a TextFileToDocument component.
|
||||
|
||||
:param encoding: The encoding of the text files. Default: `"utf-8"`
|
||||
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
|
||||
The tabular structures in documents might be noise for reader models if they don't have table parsing
|
||||
capability for finding answers. However, tables may also have long strings that could be possible candidates
|
||||
for answers. The rows containing strings are thus retained in this option. Default: `False`
|
||||
:param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
|
||||
determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
|
||||
numeric words to the total number of words in a line. Default: `0.4`
|
||||
:param valid_languages: Validate languages from a list of languages specified in the
|
||||
[ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1). This option can be used to add a test for
|
||||
encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
|
||||
error resulting in garbled text. Default: `None`
|
||||
:param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
|
||||
:param encoding: The default encoding of the text files. Default: `"utf-8"`.
|
||||
Note that if the encoding is specified in the metadata of a ByteStream,
|
||||
it will override this default.
|
||||
"""
|
||||
langdetect_import.check()
|
||||
|
||||
self.encoding = encoding
|
||||
self.remove_numeric_tables = remove_numeric_tables
|
||||
self.numeric_row_threshold = numeric_row_threshold
|
||||
self.valid_languages = valid_languages or []
|
||||
self.progress_bar = progress_bar
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(
|
||||
self,
|
||||
paths: List[Union[str, Path]],
|
||||
metadata: Optional[Union[Dict, List[Dict]]] = None,
|
||||
encoding: Optional[str] = None,
|
||||
remove_numeric_tables: Optional[bool] = None,
|
||||
numeric_row_threshold: Optional[float] = None,
|
||||
valid_languages: Optional[List[str]] = None,
|
||||
progress_bar: Optional[bool] = None,
|
||||
):
|
||||
def run(self, sources: List[Union[str, Path, ByteStream]]):
|
||||
"""
|
||||
Convert text files to Documents.
|
||||
|
||||
:param paths: A list of paths to text files.
|
||||
:param metadata: Optional metadata to attach to the Documents. If a list is provided, the length of the list
|
||||
must match the number of paths. Default: `None`
|
||||
:param encoding: The encoding of the text files. Default: `"utf-8"`
|
||||
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
|
||||
The tabular structures in documents might be noise for reader models if they don't have table parsing
|
||||
capability for finding answers. However, tables may also have long strings that could be possible candidates
|
||||
for answers. The rows containing strings are thus retained in this option. Default: `False`
|
||||
:param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
|
||||
determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
|
||||
numeric words to the total number of words in a line. Default: `0.4`
|
||||
:param valid_languages: Validate languages from a list of languages specified in the
|
||||
[ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1). This option can be used to add a test for
|
||||
encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
|
||||
error resulting in garbled text. Default: `None`
|
||||
:param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
|
||||
:param sources: A list of paths to text files or ByteStream objects.
|
||||
Note that if an encoding is specified in the metadata of a ByteStream,
|
||||
it will override the component's default.
|
||||
:return: A dictionary containing the converted documents.
|
||||
"""
|
||||
if encoding is None:
|
||||
encoding = self.encoding
|
||||
if remove_numeric_tables is None:
|
||||
remove_numeric_tables = self.remove_numeric_tables
|
||||
if numeric_row_threshold is None:
|
||||
numeric_row_threshold = self.numeric_row_threshold
|
||||
if valid_languages is None:
|
||||
valid_languages = self.valid_languages
|
||||
if progress_bar is None:
|
||||
progress_bar = self.progress_bar
|
||||
|
||||
metas = TextFileToDocument._prepare_metadata(metadata, paths)
|
||||
|
||||
documents = []
|
||||
for path, meta in tqdm(
|
||||
zip(paths, metas), total=len(paths), desc="Converting text files", disable=not progress_bar
|
||||
):
|
||||
for source in sources:
|
||||
if isinstance(source, (Path, str)):
|
||||
try:
|
||||
path = source
|
||||
source = ByteStream.from_file_path(Path(source))
|
||||
source.metadata["file_path"] = str(path)
|
||||
except Exception as e:
|
||||
logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
|
||||
continue
|
||||
try:
|
||||
text = self._read_and_clean_file(
|
||||
path=path, encoding=encoding, remove_numeric_tables=remove_numeric_tables
|
||||
)
|
||||
encoding = source.metadata.get("encoding", self.encoding)
|
||||
document = Document(content=source.data.decode(encoding))
|
||||
document.meta = source.metadata
|
||||
documents.append(document)
|
||||
except Exception as e:
|
||||
logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
|
||||
continue
|
||||
|
||||
if valid_languages is not None and not TextFileToDocument._validate_language(text, valid_languages):
|
||||
logger.warning(
|
||||
"Text from file %s is not in one of the valid languages: %s. "
|
||||
"The file may have been decoded incorrectly.",
|
||||
path,
|
||||
valid_languages,
|
||||
)
|
||||
|
||||
document = Document(content=text, meta=meta)
|
||||
documents.append(document)
|
||||
logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
|
||||
|
||||
return {"documents": documents}
|
||||
|
||||
@staticmethod
|
||||
def _prepare_metadata(metadata: Optional[Union[Dict, List[Dict]]], paths: List[Union[str, Path]]) -> List[Dict]:
|
||||
"""
|
||||
Prepare the metadata for the Documents.
|
||||
|
||||
:param metadata: The metadata for the Documents.
|
||||
:param paths: The paths to the text files.
|
||||
"""
|
||||
if metadata is None:
|
||||
return [{"file_path": str(path)} for path in paths]
|
||||
|
||||
if isinstance(metadata, dict):
|
||||
metadata = [metadata] * len(paths)
|
||||
|
||||
if len(metadata) != len(paths):
|
||||
raise PipelineRuntimeError(
|
||||
f"The number of metadata entries must match the number of paths if metadata is a list. "
|
||||
f"Number of paths: {len(paths)}, number of metadata entries: {len(metadata)}."
|
||||
)
|
||||
|
||||
return [{**m, "file_path": m.get("file_path", str(path))} for m, path in zip(metadata, paths)]
|
||||
|
||||
def _read_and_clean_file(self, path: Union[str, Path], encoding: str, remove_numeric_tables: bool) -> str:
|
||||
"""
|
||||
Read and clean the text file.
|
||||
|
||||
:param path: The path to the text file.
|
||||
:param encoding: The encoding of the text file.
|
||||
:param remove_numeric_tables: Whether to remove numeric tables.
|
||||
|
||||
:return: The text of the file cleaned from numeric tables if `remove_numeric_tables` is `True`.
|
||||
"""
|
||||
if not Path(path).exists():
|
||||
raise PipelineRuntimeError(f"File at path {path} does not exist.")
|
||||
|
||||
with open(path, encoding=encoding) as file:
|
||||
text = file.read()
|
||||
pages = text.split("\f")
|
||||
cleaned_pages = [self._clean_page(page, remove_numeric_tables) for page in pages]
|
||||
return "\f".join(cleaned_pages)
|
||||
|
||||
def _clean_page(self, page: str, remove_numeric_tables: bool) -> str:
|
||||
"""
|
||||
Clean a page of text from numeric tables if `remove_numeric_tables` is `True`.
|
||||
|
||||
:param page: The content of a page of a text file.
|
||||
:param remove_numeric_tables: Whether to remove numeric tables.
|
||||
|
||||
:return: The text from the page cleaned from numeric tables if `remove_numeric_tables` is `True`.
|
||||
"""
|
||||
cleaned_lines = page.splitlines()
|
||||
if remove_numeric_tables:
|
||||
cleaned_lines = [line for line in cleaned_lines if not self._is_numeric_row(line)]
|
||||
|
||||
return "\n".join(cleaned_lines)
|
||||
|
||||
def _is_numeric_row(self, line: str) -> bool:
|
||||
"""
|
||||
Check if a line of a text file is a numeric row. A line is considered a numeric row if the share of its
|
||||
words that contain a digit exceeds `numeric_row_threshold` (40% by default) and it does not end with a period.
|
||||
|
||||
:param line: The content of a line of a text file.
|
||||
"""
|
||||
words = line.split()
|
||||
digits = [word for word in words if any(char.isdigit() for char in word)]
|
||||
return len(digits) / len(words) > self.numeric_row_threshold and not line.strip().endswith(".")
|
||||
|
||||
@staticmethod
|
||||
def _validate_language(text: str, valid_languages: List[str]) -> bool:
|
||||
"""
|
||||
Validate if the detected language of the text is one of the valid languages.
|
||||
|
||||
:param text: The text to validate.
|
||||
:param valid_languages: A list of valid languages.
|
||||
"""
|
||||
if not valid_languages:
|
||||
return True
|
||||
|
||||
try:
|
||||
lang = langdetect.detect(text)
|
||||
except langdetect.lang_detect_exception.LangDetectException:
|
||||
lang = None
|
||||
|
||||
return lang in valid_languages
|
||||
|
||||
@ -0,0 +1,3 @@
|
||||
preview:
|
||||
- Remove most parameters from TextFileToDocument to make it match all other converters.
|
||||
- Add support for ByteStreams
|
||||
@ -4,253 +4,66 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from canals.errors import PipelineRuntimeError
|
||||
from langdetect import LangDetectException
|
||||
|
||||
from haystack.preview.dataclasses import ByteStream
|
||||
from haystack.preview.components.file_converters.txt import TextFileToDocument
|
||||
|
||||
|
||||
class TestTextfileToDocument: # pylint: disable=R0904
|
||||
class TestTextfileToDocument:
|
||||
@pytest.mark.unit
|
||||
def test_run(self, preview_samples_path):
|
||||
"""
|
||||
Test if the component runs correctly.
|
||||
"""
|
||||
paths = [preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"]
|
||||
bytestream = ByteStream.from_file_path(preview_samples_path / "txt" / "doc_3.txt")
|
||||
bytestream.metadata["file_path"] = str(preview_samples_path / "txt" / "doc_3.txt")
|
||||
bytestream.metadata["key"] = "value"
|
||||
files = [
|
||||
str(preview_samples_path / "txt" / "doc_1.txt"),
|
||||
preview_samples_path / "txt" / "doc_2.txt",
|
||||
bytestream,
|
||||
]
|
||||
converter = TextFileToDocument()
|
||||
output = converter.run(paths=paths)
|
||||
output = converter.run(sources=files)
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 2
|
||||
assert docs[0].content == "Some text for testing.\nTwo lines in here."
|
||||
assert docs[1].content == "This is a test line.\n123 456 789\n987 654 321."
|
||||
assert docs[0].meta["file_path"] == str(paths[0])
|
||||
assert docs[1].meta["file_path"] == str(paths[1])
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_run_warning_for_invalid_language(self, preview_samples_path, caplog):
|
||||
file_path = preview_samples_path / "txt" / "doc_1.txt"
|
||||
converter = TextFileToDocument()
|
||||
with patch(
|
||||
"haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"
|
||||
), caplog.at_level(logging.WARNING):
|
||||
output = converter.run(paths=[file_path], valid_languages=["de"])
|
||||
assert (
|
||||
f"Text from file {file_path} is not in one of the valid languages: ['de']. "
|
||||
f"The file may have been decoded incorrectly." in caplog.text
|
||||
)
|
||||
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
assert docs[0].content == "Some text for testing.\nTwo lines in here."
|
||||
assert len(docs) == 3
|
||||
assert "Some text for testing." in docs[0].content
|
||||
assert "This is a test line." in docs[1].content
|
||||
assert "That's yet another file!" in docs[2].content
|
||||
assert docs[0].meta["file_path"] == str(files[0])
|
||||
assert docs[1].meta["file_path"] == str(files[1])
|
||||
assert docs[2].meta == bytestream.metadata
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_run_error_handling(self, preview_samples_path, caplog):
|
||||
"""
|
||||
Test if the component correctly handles errors.
|
||||
"""
|
||||
paths = [preview_samples_path / "txt" / "doc_1.txt", "non_existing_file.txt"]
|
||||
paths = [
|
||||
preview_samples_path / "txt" / "doc_1.txt",
|
||||
"non_existing_file.txt",
|
||||
preview_samples_path / "txt" / "doc_3.txt",
|
||||
]
|
||||
converter = TextFileToDocument()
|
||||
with caplog.at_level(logging.WARNING):
|
||||
output = converter.run(paths=paths)
|
||||
assert (
|
||||
"Could not read file non_existing_file.txt. Skipping it. Error message: File at path non_existing_file.txt does not exist."
|
||||
in caplog.text
|
||||
)
|
||||
output = converter.run(sources=paths)
|
||||
assert "non_existing_file.txt" in caplog.text
|
||||
docs = output["documents"]
|
||||
assert len(docs) == 1
|
||||
assert len(docs) == 2
|
||||
assert docs[0].meta["file_path"] == str(paths[0])
|
||||
assert docs[1].meta["file_path"] == str(paths[2])
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_prepare_metadata_no_metadata(self):
|
||||
def test_encoding_override(self, preview_samples_path):
|
||||
"""
|
||||
Test if the metadata is correctly prepared when no custom metadata is provided.
|
||||
Test if the encoding metadata field is used properly
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
meta = converter._prepare_metadata(
|
||||
metadata=None, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
|
||||
)
|
||||
assert len(meta) == 2
|
||||
assert meta[0]["file_path"] == "data/sample_path_1.txt"
|
||||
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
|
||||
bytestream = ByteStream.from_file_path(preview_samples_path / "txt" / "doc_1.txt")
|
||||
bytestream.metadata["key"] = "value"
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_prepare_metadata_single_dict(self):
|
||||
"""
|
||||
Test if the metadata is correctly prepared when a single dict is provided.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
meta = converter._prepare_metadata(
|
||||
metadata={"name": "test"}, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
|
||||
)
|
||||
assert len(meta) == 2
|
||||
assert meta[0]["file_path"] == "data/sample_path_1.txt"
|
||||
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
|
||||
assert meta[0]["name"] == "test"
|
||||
assert meta[1]["name"] == "test"
|
||||
converter = TextFileToDocument(encoding="utf-16")
|
||||
output = converter.run(sources=[bytestream])
|
||||
assert "Some text for testing." not in output["documents"][0].content
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_prepare_metadata_list_of_dicts(self):
|
||||
"""
|
||||
Test if the metadata is correctly prepared when a list of dicts is provided.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
meta = converter._prepare_metadata(
|
||||
metadata=[{"name": "test1"}, {"name": "test2"}],
|
||||
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")],
|
||||
)
|
||||
assert len(meta) == 2
|
||||
assert meta[0]["file_path"] == "data/sample_path_1.txt"
|
||||
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
|
||||
assert meta[0]["name"] == "test1"
|
||||
assert meta[1]["name"] == "test2"
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_prepare_metadata_unmatching_list_len(self):
|
||||
"""
|
||||
Test if an error is raised when the number of metadata dicts is not equal to the number of
|
||||
file paths.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
with pytest.raises(
|
||||
PipelineRuntimeError,
|
||||
match="The number of metadata entries must match the number of paths if metadata is a list.",
|
||||
):
|
||||
converter._prepare_metadata(
|
||||
metadata=[{"name": "test1"}, {"name": "test2"}],
|
||||
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt"), "data/sample_path_3.txt"],
|
||||
)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_read_and_clean_file(self, preview_samples_path):
|
||||
"""
|
||||
Test if the file is correctly read.
|
||||
"""
|
||||
file_path = preview_samples_path / "txt" / "doc_1.txt"
|
||||
converter = TextFileToDocument()
|
||||
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
|
||||
assert text == "Some text for testing.\nTwo lines in here."
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_read_and_clean_file_non_existing_file(self):
|
||||
"""
|
||||
Test if an error is raised when the file does not exist.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
file_path = "non_existing_file.txt"
|
||||
with pytest.raises(PipelineRuntimeError, match=f"File at path {file_path} does not exist."):
|
||||
converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_read_and_clean_file_remove_numeric_tables(self, preview_samples_path):
|
||||
"""
|
||||
Test if the file is correctly read and numeric tables are removed.
|
||||
"""
|
||||
file_path = preview_samples_path / "txt" / "doc_2.txt"
|
||||
converter = TextFileToDocument()
|
||||
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=True)
|
||||
assert text == "This is a test line.\n987 654 321."
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_clean_page_without_remove_numeric_tables(self):
|
||||
"""
|
||||
Test if the page is not changed when remove_numeric_tables is False.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
page = "This is a test line.\n123 456 789"
|
||||
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=False)
|
||||
assert cleaned_page == page
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_clean_page_with_remove_numeric_tables(self):
|
||||
"""
|
||||
Test if the page is correctly cleaned when remove_numeric_tables is True.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
page = "This is a test line.\n123 456 789"
|
||||
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=True)
|
||||
assert cleaned_page == "This is a test line."
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_is_numeric_row_only_numbers(self):
|
||||
"""
|
||||
Test if the line is correctly identified as a numeric row when it only contains numbers.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
line = "123 456 789"
|
||||
assert converter._is_numeric_row(line=line)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_is_numeric_row_only_text(self):
|
||||
"""
|
||||
Test if the line is correctly identified as a non-numeric row when it only contains text.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
line = "This is a test line."
|
||||
assert not converter._is_numeric_row(line=line)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_is_numeric_row_only_numbers_with_period(self):
|
||||
"""
|
||||
Test if the line is correctly identified as a non-numeric row when it only contains numbers and a period at
|
||||
the end.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
line = "123 456 789."
|
||||
assert not converter._is_numeric_row(line=line)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_is_numeric_row_more_numbers_than_text(self):
|
||||
"""
|
||||
Test if the line is correctly identified as a numeric row when more than 40% of its words contain digits.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
line = "123 456 789 This is a test"
|
||||
assert converter._is_numeric_row(line=line)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_is_numeric_row_less_numbers_than_text(self):
|
||||
"""
|
||||
Test if the line is correctly identified as a non-numeric row when less than 40% of its words contain digits.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
line = "123 456 789 This is a test line"
|
||||
assert not converter._is_numeric_row(line=line)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_is_numeric_row_words_consist_of_numbers_and_text(self):
|
||||
"""
|
||||
Test if the line is correctly identified as a numeric row when the words consist of numbers and text.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
line = "123eur 456usd"
|
||||
assert converter._is_numeric_row(line=line)
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_validate_language(self):
|
||||
"""
|
||||
Test if the language is correctly validated.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
|
||||
assert converter._validate_language(text="This is an english text.", valid_languages=["en"])
|
||||
assert not converter._validate_language(text="This is an english text.", valid_languages=["de"])
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_validate_language_no_languages_specified(self):
|
||||
"""
|
||||
Test if _validate_languages returns True when no languages are specified.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
assert converter._validate_language(text="This is an english test.", valid_languages=[])
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_validate_language_lang_detect_exception(self):
|
||||
"""
|
||||
Test if _validate_languages returns False when langdetect throws an exception.
|
||||
"""
|
||||
converter = TextFileToDocument()
|
||||
with patch(
|
||||
"haystack.preview.components.file_converters.txt.langdetect.detect",
|
||||
side_effect=LangDetectException(code=0, message="Test"),
|
||||
):
|
||||
assert not converter._validate_language(text="This is an english text.", valid_languages=["en"])
|
||||
bytestream.metadata["encoding"] = "utf-8"
|
||||
output = converter.run(sources=[bytestream])
|
||||
assert "Some text for testing." in output["documents"][0].content
|
||||
|
||||
11
test/preview/test_files/txt/doc_3.txt
Normal file
11
test/preview/test_files/txt/doc_3.txt
Normal file
@ -0,0 +1,11 @@
|
||||
That's yet another file!
|
||||
|
||||
it contains
|
||||
|
||||
|
||||
|
||||
|
||||
many
|
||||
|
||||
|
||||
empty lines.
|
||||
Loading…
x
Reference in New Issue
Block a user