mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-27 02:40:41 +00:00

* Unify CI tests (from #2466) * Update Documentation & Code Style * Change folder names * Fix markers list * Remove marker 'slow', replaced with 'integration' * Soften children check * Start ES first so it has time to boot while Python is setup * Run the full workflow * Try to make pip upgrade on Windows * Set KG tests as integration * Update Documentation & Code Style * typo * faster pylint * Make Pylint use the cache * filter diff files for pylint * debug pylint statement * revert pylint changes * Remove path from asserted log (fails on Windows) * Skip preprocessor test on Windows * Tackling Windows specific failures * Fix pytest command for windows suites * Remove \ from command * Move poppler test into integration * Skip opensearch test on windows * Add tolerance in reader sas score for Windows * Another pytorch approx * Raise time limit for unit tests :( * Skip poppler test on Windows CI * Specify to pull with FF only in docs check * temporarily run the docs check immediately * Allow merge commit for now * Try without fetch depth * Accelerating test * Accelerating test * Add repository and ref alongside fetch-depth * Separate out code&docs check from tests * Use setup-python cache * Delete custom action * Remove the pull step in the docs check, will find a way to run on bot commits * Add requirements.txt in .github for caching * Actually install dependencies * Change deps group for pylint * Unclear why the requirements.txt is still required :/ * Fix the code check python setup * Install all deps for pylint * Make the autoformat check depend on tests and doc updates workflows * Try installing dependencies in another order * Try again to install the deps * quoting the paths * Ad back the requirements * Try again to install rest_api and ui * Change deps group * Duplicate haystack install line * See if the cache is the problem * Disable also in mypy, who knows * split the install step * Split install step everywhere * Revert "Separate out code&docs check from 
tests" This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd. * Add back the action * Proactive support for audio (see text2speech branch) * Fix label generator tests * Remove install of libsndfile1 on win temporarily * exclude audio tests on win * install ffmpeg for integration tests Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
116 lines
4.4 KiB
Python
116 lines
4.4 KiB
Python
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from haystack import Document
|
|
from haystack.nodes.file_converter.pdf import PDFToTextConverter
|
|
from haystack.nodes.preprocessor.preprocessor import PreProcessor
|
|
|
|
from ..conftest import SAMPLES_PATH
|
|
|
|
# Shared fixture for the splitting tests below: three paragraphs separated by
# blank lines, five sentences each. The last sentence deliberately contains the
# abbreviation "Dr." so that a naive split on "." would produce the wrong count.
# NOTE(review): the word-split assertions depend on the exact whitespace of this
# literal; trailing spaces at line ends (if any existed upstream) cannot be seen
# here -- confirm against the upstream file before relying on word counts.
TEXT = """
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.

This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in
paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2.

This is a sample sentence in paragraph_3. This is a sample sentence in paragraph_3. This is a sample sentence in
paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test with using an abbreviation like Dr.
in the sentence.
"""
|
|
|
|
|
|
def test_preprocess_sentence_split():
    """Check the number of documents produced when splitting ``TEXT`` by sentence.

    One sentence per split is expected to yield 15 documents, and ten
    sentences per split is expected to yield 2.
    """
    document = Document(content=TEXT)

    # (sentences per split, expected number of resulting documents)
    for sentences_per_split, expected_count in ((1, 15), (10, 2)):
        splitter = PreProcessor(
            split_length=sentences_per_split,
            split_overlap=0,
            split_by="sentence",
            split_respect_sentence_boundary=False,
        )
        pieces = splitter.process(document)
        assert len(pieces) == expected_count
|
|
|
|
|
|
def test_preprocess_word_split():
    """Check word-based splitting of ``TEXT`` with and without sentence boundaries.

    Covers a plain 10-word window, a 15-word window that respects sentence
    boundaries (including per-chunk word-count checks), and two further
    boundary-respecting configurations, one of them with overlap.
    """
    document = Document(content=TEXT)

    # Plain fixed-size windows that ignore sentence boundaries.
    fixed_window = PreProcessor(
        split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False
    )
    assert len(fixed_window.process(document)) == 11

    # Windows of at most 15 words that are not allowed to cut a sentence in half.
    sentence_aware = PreProcessor(
        split_length=15, split_overlap=0, split_by="word", split_respect_sentence_boundary=True
    )
    chunks = sentence_aware.process(document)
    for position, chunk in enumerate(chunks):
        word_count = len(chunk.content.split(" "))
        if position == 0:
            assert word_count == 14
        # The chunk starting with "This is to trick" is the only one allowed to exceed the limit.
        assert word_count <= 15 or chunk.content.startswith("This is to trick")
    assert len(chunks) == 8

    # (split length, overlap, expected number of documents), all respecting sentence boundaries
    for window, overlap, expected_count in ((40, 10, 5), (5, 0, 15)):
        splitter = PreProcessor(
            split_length=window,
            split_overlap=overlap,
            split_by="word",
            split_respect_sentence_boundary=True,
        )
        assert len(splitter.process(document)) == expected_count
|
|
|
|
|
|
def test_preprocess_passage_split():
    """Check the number of documents produced when splitting ``TEXT`` by passage.

    One passage per split is expected to yield 3 documents, and two passages
    per split is expected to yield 2.
    """
    document = Document(content=TEXT)

    # (passages per split, expected number of resulting documents)
    for passages_per_split, expected_count in ((1, 3), (2, 2)):
        splitter = PreProcessor(
            split_length=passages_per_split,
            split_overlap=0,
            split_by="passage",
            split_respect_sentence_boundary=False,
        )
        pieces = splitter.process(document)
        assert len(pieces) == expected_count
|
|
|
|
|
|
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")
def test_clean_header_footer():
    """Check that ``clean_header_footer=True`` strips the header and footer of a converted PDF.

    The sample PDF is converted to text and run through a non-splitting
    ``PreProcessor``; a single document without the header or footer text
    is expected.
    """
    pdf_converter = PDFToTextConverter()
    # This sample file contains a header and a footer.
    sample_pdf = Path(SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")
    converted = pdf_converter.convert(file_path=sample_pdf)

    cleaner = PreProcessor(clean_header_footer=True, split_by=None)
    cleaned = cleaner.process(converted)

    assert len(cleaned) == 1

    cleaned_text = cleaned[0].content
    assert "This is a header." not in cleaned_text
    assert "footer" not in cleaned_text
|
|
|
|
|
|
def test_remove_substrings():
    """Check that ``remove_substrings`` removes exactly the requested substrings.

    The listed substrings must disappear from the processed content, while
    the other probed fragments of the input must still be present.
    """
    unwanted = ("This is a header.", "wiki", "🪲")
    retained = ("whitespace", "✨")

    document = Document(content="This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")

    # Sanity check: the input really contains every fragment probed below.
    for fragment in unwanted + retained:
        assert fragment in document.content

    remover = PreProcessor(remove_substrings=list(unwanted))
    documents = remover.process(document)
    cleaned_text = documents[0].content

    for fragment in unwanted:
        assert fragment not in cleaned_text
    for fragment in retained:
        assert fragment in cleaned_text
|