fix: Run update_embeddings in examples (#6008)

* added hybrid search example

Added an example about hybrid search for faq pipeline on covid dataset

* formatted with black formatter

* renamed document

* fixed

* fixed typos

* added test

added test for hybrid search

* fixed whitespaces

* removed test for hybrid search

* fixed pylint

* commented logging

* updated hybrid search example

* release notes

* Update hybrid_search_faq_pipeline.py-815df846dca7e872.yaml

* Update hybrid_search_faq_pipeline.py

* mention hybrid search example in release notes

* reduce installed dependencies in examples test workflow

* do not install cuda dependencies

* skip models if API key not set; delete document indices

* skip models if API key not set; delete document indices

* skip models if API key not set; delete document indices

* keep roberta-base model and inference extra

* pylint

* disable pylint no-logging-basicconfig rule

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
This commit is contained in:
Nicola Procopio 2023-10-10 16:38:52 +02:00 committed by GitHub
parent c05f564359
commit c102b152dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 46 additions and 27 deletions

View File

@ -42,7 +42,9 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Haystack
run: pip install .[all,dev]
run: |
pip install --upgrade pip
pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
- name: Run
run: pytest examples/

View File

@ -1,23 +1,25 @@
# Disable pylint errors for logging basicConfig
# pylint: disable=no-logging-basicconfig
import logging
import pandas as pd
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.pipelines import Pipeline
from haystack.utils import fetch_archive_from_http, launch_es, print_answers
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.utils import launch_es, print_answers, fetch_archive_from_http
import pandas as pd
from haystack.pipelines import Pipeline
def basic_faq_pipeline():
document_store = ElasticsearchDocumentStore(
host="localhost",
username="",
password="",
index="document",
index="example-document",
embedding_field="question_emb",
embedding_dim=384,
excluded_meta_data=["question_emb"],
@ -52,6 +54,7 @@ def basic_faq_pipeline():
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)
document_store.update_embeddings(retriever)
# Initialize a Pipeline (this time without a reader) and ask questions
pipeline = Pipeline()
@ -62,6 +65,9 @@ def basic_faq_pipeline():
prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
print_answers(prediction, details="medium")
# Remove the index once we're done to save space
document_store.delete_index(index="example-document")
return prediction

View File

@ -1,21 +1,23 @@
# Disable pylint errors for logging basicConfig
# pylint: disable=no-logging-basicconfig
import logging
from pathlib import Path
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, FARMReader
from haystack.nodes.file_classifier import FileTypeClassifier
from haystack.nodes.file_converter import TextConverter
from haystack.nodes.preprocessor import PreProcessor
from haystack.pipelines import Pipeline
from haystack.utils import fetch_archive_from_http, launch_es, print_answers
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, print_answers, launch_es
from haystack.nodes import FARMReader, BM25Retriever
from haystack.nodes.file_classifier import FileTypeClassifier
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.file_converter import TextConverter
from haystack.pipelines import Pipeline
def basic_qa_pipeline():
# Initialize a DocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="example-document")
# fetch, pre-process and write documents
doc_dir = "data/basic_qa_pipeline"
@ -66,6 +68,9 @@ def basic_qa_pipeline():
)
print_answers(prediction, details="minimum")
# Remove the index once we're done to save space
document_store.delete_index(index="example-document")
return prediction

View File

@ -55,6 +55,7 @@ def hybrid_search_faq_pipeline():
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)
document_store.update_embeddings(retriever=dense_retriever)
# Initialize a Pipeline (this time without a reader) and ask questions
pipeline = Pipeline()

View File

@ -1,6 +1,6 @@
from examples.basic_faq_pipeline import basic_faq_pipeline
from haystack.schema import Answer, Document
from haystack.schema import Answer
def test_basic_faq_pipeline():

View File

@ -6,8 +6,7 @@ from examples.getting_started import getting_started
from haystack.schema import Answer, Document
@pytest.mark.integration
@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
@pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"])
def test_getting_started(provider):
if provider == "anthropic":
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
@ -17,6 +16,8 @@ def test_getting_started(provider):
api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
elif provider == "openai":
api_key = os.environ.get("OPENAI_API_KEY", "")
if api_key:
result = getting_started(provider=provider, API_KEY=api_key)
# Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.

View File

@ -0,0 +1,4 @@
---
fixes:
- |
    Added document_store.update_embeddings call to pipeline examples so that embeddings are calculated for newly added documents.