fix: Run update_embeddings in examples (#6008)

* added hybrid search example

Added an example about hybrid search for faq pipeline on covid dataset

* formatted with black formatter

* renamed document

* fixed

* fixed typos

* added test

added test for hybrid search

* fixed whitespaces

* removed test for hybrid search

* fixed pylint

* commented logging

* updated hybrid search example

* release notes

* Update hybrid_search_faq_pipeline.py-815df846dca7e872.yaml

* Update hybrid_search_faq_pipeline.py

* mention hybrid search example in release notes

* reduce installed dependencies in examples test workflow

* do not install cuda dependencies

* skip models if API key not set; delete document indices

* skip models if API key not set; delete document indices

* skip models if API key not set; delete document indices

* keep roberta-base model and inference extra

* pylint

* disable pylint no-logging-basicconfig rule

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
This commit is contained in:
Nicola Procopio 2023-10-10 16:38:52 +02:00 committed by GitHub
parent c05f564359
commit c102b152dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 46 additions and 27 deletions

View File

@ -42,7 +42,9 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Haystack
run: pip install .[all,dev]
run: |
pip install --upgrade pip
pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
- name: Run
run: pytest examples/

View File

@ -1,23 +1,25 @@
# Disable pylint errors for logging basicConfig
# pylint: disable=no-logging-basicconfig
import logging
import pandas as pd
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.pipelines import Pipeline
from haystack.utils import fetch_archive_from_http, launch_es, print_answers
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.utils import launch_es, print_answers, fetch_archive_from_http
import pandas as pd
from haystack.pipelines import Pipeline
def basic_faq_pipeline():
document_store = ElasticsearchDocumentStore(
host="localhost",
username="",
password="",
index="document",
index="example-document",
embedding_field="question_emb",
embedding_dim=384,
excluded_meta_data=["question_emb"],
@ -52,6 +54,7 @@ def basic_faq_pipeline():
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)
document_store.update_embeddings(retriever)
# Initialize a Pipeline (this time without a reader) and ask questions
pipeline = Pipeline()
@ -62,6 +65,9 @@ def basic_faq_pipeline():
prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
print_answers(prediction, details="medium")
# Remove the index once we're done to save space
document_store.delete_index(index="example-document")
return prediction

View File

@ -1,21 +1,23 @@
# Disable pylint errors for logging basicConfig
# pylint: disable=no-logging-basicconfig
import logging
from pathlib import Path
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, FARMReader
from haystack.nodes.file_classifier import FileTypeClassifier
from haystack.nodes.file_converter import TextConverter
from haystack.nodes.preprocessor import PreProcessor
from haystack.pipelines import Pipeline
from haystack.utils import fetch_archive_from_http, launch_es, print_answers
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, print_answers, launch_es
from haystack.nodes import FARMReader, BM25Retriever
from haystack.nodes.file_classifier import FileTypeClassifier
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.file_converter import TextConverter
from haystack.pipelines import Pipeline
def basic_qa_pipeline():
# Initialize a DocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="example-document")
# fetch, pre-process and write documents
doc_dir = "data/basic_qa_pipeline"
@ -66,6 +68,9 @@ def basic_qa_pipeline():
)
print_answers(prediction, details="minimum")
# Remove the index once we're done to save space
document_store.delete_index(index="example-document")
return prediction

View File

@ -55,6 +55,7 @@ def hybrid_search_faq_pipeline():
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)
document_store.update_embeddings(retriever=dense_retriever)
# Initialize a Pipeline (this time without a reader) and ask questions
pipeline = Pipeline()

View File

@ -1,6 +1,6 @@
from examples.basic_faq_pipeline import basic_faq_pipeline
from haystack.schema import Answer, Document
from haystack.schema import Answer
def test_basic_faq_pipeline():

View File

@ -6,8 +6,7 @@ from examples.getting_started import getting_started
from haystack.schema import Answer, Document
@pytest.mark.integration
@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
@pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"])
def test_getting_started(provider):
if provider == "anthropic":
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
@ -17,6 +16,8 @@ def test_getting_started(provider):
api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
elif provider == "openai":
api_key = os.environ.get("OPENAI_API_KEY", "")
if api_key:
result = getting_started(provider=provider, API_KEY=api_key)
# Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.

View File

@ -0,0 +1,4 @@
---
fixes:
- |
    Added document_store.update_embeddings call to pipeline examples so that embeddings are calculated for newly added documents.