mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-30 19:36:23 +00:00

* draft DocumentJoiner * implement merge and rrf * draft end-to-end test with DocumentJoiner in hybrid doc search pipeline * adjust for variadics Canals PR #122 * fix text_embedder input * adapt to the new Document class * adapt to new doc id * specify documents input as Variadic in run method * compare doc ids instead of full docs * rename text_file_converter input to sources * update docstring * Update haystack/preview/components/routers/document_joiner.py Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> * Apply suggestions from docstring review Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> * capitalize Documents and Retrievers in docstrings * fix log message in test --------- Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: anakin87 <stefanofiorucci@gmail.com> Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
58 lines
2.7 KiB
Python
58 lines
2.7 KiB
Python
import json
|
|
|
|
from haystack.preview import Pipeline, Document
|
|
from haystack.preview.components.embedders import SentenceTransformersTextEmbedder
|
|
from haystack.preview.components.rankers import TransformersSimilarityRanker
|
|
from haystack.preview.components.routers.document_joiner import DocumentJoiner
|
|
from haystack.preview.document_stores import InMemoryDocumentStore
|
|
from haystack.preview.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
|
|
|
|
|
|
def test_hybrid_doc_search_pipeline(tmp_path):
    """Exercise a hybrid document search pipeline from construction to ranked results.

    The pipeline feeds a BM25 retriever and an embedding retriever (driven by a
    text embedder) into a DocumentJoiner, whose output goes to a similarity
    ranker. The test draws the pipeline, serializes it to JSON and reloads it,
    writes four Documents into the reloaded pipeline's document store, runs a
    query, and checks the content of the two top-ranked Documents.

    :param tmp_path: directory where the pipeline drawing and its JSON dump are
        written (pytest's ``tmp_path`` fixture).
    """
    image_path = tmp_path / "test_hybrid_doc_search_pipeline.png"
    json_path = tmp_path / "test_hybrid_doc_search_pipeline.json"

    # Register the components; both retrievers share the same store instance.
    store = InMemoryDocumentStore()
    pipeline = Pipeline()
    pipeline.add_component(name="bm25_retriever", instance=InMemoryBM25Retriever(document_store=store))
    pipeline.add_component(
        name="text_embedder",
        instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
    )
    pipeline.add_component(name="embedding_retriever", instance=InMemoryEmbeddingRetriever(document_store=store))
    pipeline.add_component(name="joiner", instance=DocumentJoiner())
    pipeline.add_component(name="ranker", instance=TransformersSimilarityRanker(top_k=20))

    # Wire the graph: both retrieval branches meet in the joiner, then get ranked.
    for sender, receiver in (
        ("bm25_retriever", "joiner"),
        ("text_embedder", "embedding_retriever"),
        ("embedding_retriever", "joiner"),
        ("joiner", "ranker"),
    ):
        pipeline.connect(sender, receiver)

    # Draw the pipeline
    pipeline.draw(image_path)

    # Serialize the pipeline to JSON (and echo a pretty-printed copy to stdout)
    with open(json_path, "w") as out_file:
        print(json.dumps(pipeline.to_dict(), indent=4))
        json.dump(pipeline.to_dict(), out_file)

    # Load the pipeline back; everything below runs against the reloaded copy
    with open(json_path, "r") as in_file:
        pipeline = Pipeline.from_dict(json.load(in_file))

    # Populate the document store, reached through the reloaded BM25 retriever
    contents = (
        "My name is Jean and I live in Paris.",
        "My name is Mark and I live in Berlin.",
        "My name is Mario and I live in the capital of Italy.",
        "My name is Giorgio and I live in Rome.",
    )
    documents = [Document(content=text) for text in contents]
    reloaded_store = pipeline.get_component("bm25_retriever").document_store
    reloaded_store.write_documents(documents)

    # The same query goes to the keyword branch, the embedding branch, and the ranker.
    query = "Who lives in Rome?"
    run_inputs = {
        "bm25_retriever": {"query": query},
        "text_embedder": {"text": query},
        "ranker": {"query": query},
    }
    result = pipeline.run(run_inputs)

    ranked = result["ranker"]["documents"]
    assert ranked[0].content == "My name is Giorgio and I live in Rome."
    assert ranked[1].content == "My name is Mario and I live in the capital of Italy."
|