mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-27 19:00:35 +00:00

* Unify CI tests (from #2466) * Update Documentation & Code Style * Change folder names * Fix markers list * Remove marker 'slow', replaced with 'integration' * Soften children check * Start ES first so it has time to boot while Python is setup * Run the full workflow * Try to make pip upgrade on Windows * Set KG tests as integration * Update Documentation & Code Style * typo * faster pylint * Make Pylint use the cache * filter diff files for pylint * debug pylint statement * revert pylint changes * Remove path from asserted log (fails on Windows) * Skip preprocessor test on Windows * Tackling Windows specific failures * Fix pytest command for windows suites * Remove \ from command * Move poppler test into integration * Skip opensearch test on windows * Add tolerance in reader sas score for Windows * Another pytorch approx * Raise time limit for unit tests :( * Skip poppler test on Windows CI * Specify to pull with FF only in docs check * temporarily run the docs check immediately * Allow merge commit for now * Try without fetch depth * Accelerating test * Accelerating test * Add repository and ref alongside fetch-depth * Separate out code&docs check from tests * Use setup-python cache * Delete custom action * Remove the pull step in the docs check, will find a way to run on bot commits * Add requirements.txt in .github for caching * Actually install dependencies * Change deps group for pylint * Unclear why the requirements.txt is still required :/ * Fix the code check python setup * Install all deps for pylint * Make the autoformat check depend on tests and doc updates workflows * Try installing dependencies in another order * Try again to install the deps * quoting the paths * Ad back the requirements * Try again to install rest_api and ui * Change deps group * Duplicate haystack install line * See if the cache is the problem * Disable also in mypy, who knows * split the install step * Split install step everywhere * Revert "Separate out code&docs check from 
tests" This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd. * Add back the action * Proactive support for audio (see text2speech branch) * Fix label generator tests * Remove install of libsndfile1 on win temporarily * exclude audio tests on win * install ffmpeg for integration tests Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
112 lines
5.6 KiB
Python
112 lines
5.6 KiB
Python
import pytest
|
|
|
|
from haystack.schema import Document
|
|
from haystack.pipelines import SearchSummarizationPipeline
|
|
from haystack.nodes import DensePassageRetriever, EmbeddingRetriever
|
|
|
|
# Two unrelated passages (a PG&E power-shutoff notice and a description of the
# Eiffel Tower); the summarizer tests feed these in as separate documents.
DOCS = [
    Document(content=passage)
    for passage in (
        """PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""",
        """The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.""",
    )
]
|
|
|
|
# Expected summarizer output for DOCS, listed in the same order as the documents.
EXPECTED_SUMMARIES = [
    # Summary of the PG&E shutoff passage.
    "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
    # Summary of the Eiffel Tower passage.
    "The Eiffel Tower is a landmark in Paris, France.",
]
|
|
|
|
# The Eiffel Tower passage cut into two consecutive documents; used by the
# tests that ask for a single summary spanning several documents.
SPLIT_DOCS = [
    Document(content=fragment)
    for fragment in (
        """The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930.""",
        """It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.""",
    )
]
|
|
|
|
# The order of the documents strongly affects the generated summary:
# the same documents passed in a different order produce a different summary.
# Both known outcomes for SPLIT_DOCS are therefore listed here.
EXPECTED_ONE_SUMMARIES = [
    "The Eiffel Tower is a landmark in Paris, France.",
    "The Eiffel Tower, built in 1889 in Paris, France, is the world's tallest free-standing structure.",
]
|
|
|
|
|
|
@pytest.mark.integration
@pytest.mark.summarizer
def test_summarization(summarizer):
    """Summarize every document in DOCS and compare each result with EXPECTED_SUMMARIES."""
    results = summarizer.predict(documents=DOCS)

    # One summary document must come back per input document.
    assert len(results) == len(DOCS)

    produced_texts = [result.content for result in results]
    for expected_text, produced_text in zip(EXPECTED_SUMMARIES, produced_texts):
        assert expected_text == produced_text
|
|
|
|
|
|
@pytest.mark.integration
@pytest.mark.summarizer
def test_summarization_one_summary(summarizer):
    """Request a single summary for SPLIT_DOCS and check it against the first expected summary."""
    merged = summarizer.predict(documents=SPLIT_DOCS, generate_single_summary=True)

    # With generate_single_summary=True exactly one document is expected back.
    assert len(merged) == 1

    single_summary = merged[0]
    assert EXPECTED_ONE_SUMMARIES[0] == single_summary.content
|
|
|
|
|
|
@pytest.mark.integration
@pytest.mark.summarizer
def test_summarization_batch_single_doc_list(summarizer):
    """Run predict_batch on one flat list of documents and compare with EXPECTED_SUMMARIES."""
    batch_results = summarizer.predict_batch(documents=DOCS)

    # A flat input list must yield one summary per document.
    assert len(batch_results) == len(DOCS)

    produced_texts = [result.content for result in batch_results]
    for expected_text, produced_text in zip(EXPECTED_SUMMARIES, produced_texts):
        assert expected_text == produced_text
|
|
|
|
|
|
@pytest.mark.integration
@pytest.mark.summarizer
def test_summarization_batch_multiple_doc_lists(summarizer):
    """Run predict_batch on several document lists and verify every returned list.

    The same DOCS list is submitted twice, so each result list must contain
    one summary per document, matching EXPECTED_SUMMARIES in order.
    """
    doc_lists = [DOCS, DOCS]
    summarized_lists = summarizer.predict_batch(documents=doc_lists)

    assert len(summarized_lists) == len(doc_lists)  # Number of document lists

    # Check every result list, not only the first one, so that a problem
    # affecting later lists in the batch cannot slip through unnoticed.
    for summarized_docs in summarized_lists:
        assert len(summarized_docs) == len(DOCS)
        for expected_summary, summary in zip(EXPECTED_SUMMARIES, summarized_docs):
            assert expected_summary == summary.content
|
|
|
|
|
|
@pytest.mark.integration
@pytest.mark.summarizer
@pytest.mark.parametrize(
    "retriever,document_store", [("embedding", "memory"), ("elasticsearch", "elasticsearch")], indirect=True
)
def test_summarization_pipeline(document_store, retriever, summarizer):
    """Retrieve the best match for an Eiffel Tower query from DOCS and check its summary."""
    document_store.write_documents(DOCS)

    # For EmbeddingRetriever / DensePassageRetriever instances the store's
    # embeddings are updated with this retriever before querying.
    uses_embeddings = isinstance(retriever, (EmbeddingRetriever, DensePassageRetriever))
    if uses_embeddings:
        document_store.update_embeddings(retriever=retriever)

    search_and_summarize = SearchSummarizationPipeline(
        retriever=retriever, summarizer=summarizer, return_in_answer_format=True
    )
    result = search_and_summarize.run(query="Where is Eiffel Tower?", params={"Retriever": {"top_k": 1}})

    answers = result["answers"]
    # top_k=1 -> a single retrieved document, hence a single answer.
    assert len(answers) == 1
    assert "The Eiffel Tower is a landmark in Paris, France." == answers[0]["answer"]
|
|
|
|
|
|
@pytest.mark.integration
@pytest.mark.summarizer
@pytest.mark.parametrize(
    "retriever,document_store", [("embedding", "memory"), ("elasticsearch", "elasticsearch")], indirect=True
)
def test_summarization_pipeline_one_summary(document_store, retriever, summarizer):
    """Retrieve both SPLIT_DOCS fragments and check that they are merged into one summary."""
    document_store.write_documents(SPLIT_DOCS)

    # For EmbeddingRetriever / DensePassageRetriever instances the store's
    # embeddings are updated with this retriever before querying.
    uses_embeddings = isinstance(retriever, (EmbeddingRetriever, DensePassageRetriever))
    if uses_embeddings:
        document_store.update_embeddings(retriever=retriever)

    search_and_summarize = SearchSummarizationPipeline(
        retriever=retriever, summarizer=summarizer, return_in_answer_format=True
    )
    run_params = {"Retriever": {"top_k": 2}, "Summarizer": {"generate_single_summary": True}}
    result = search_and_summarize.run(query="Where is Eiffel Tower?", params=run_params)

    answers = result["answers"]
    assert len(answers) == 1
    # Either known summary is accepted: the retrieval order of the two
    # fragments is not fixed by this test.
    assert answers[0]["answer"] in EXPECTED_ONE_SUMMARIES
|