mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-08 06:26:48 +00:00
fix: Run update_embeddings in examples (#6008)
* added hybrid search example Added an example about hybrid search for faq pipeline on covid dataset * formatted with black formatter * renamed document * fixed * fixed typos * added test added test for hybrid search * fixed whitespaces * removed test for hybrid search * fixed pylint * commented logging * updated hybrid search example * release notes * Update hybrid_search_faq_pipeline.py-815df846dca7e872.yaml * Update hybrid_search_faq_pipeline.py * mention hybrid search example in release notes * reduce installed dependencies in examples test workflow * do not install cuda dependencies * skip models if API key not set; delete document indices * skip models if API key not set; delete document indices * skip models if API key not set; delete document indices * keep roberta-base model and inference extra * pylint * disable pylint no-logging-basicconfig rule --------- Co-authored-by: Julian Risch <julian.risch@deepset.ai>
This commit is contained in:
parent
c05f564359
commit
c102b152dc
4
.github/workflows/examples_tests.yml
vendored
4
.github/workflows/examples_tests.yml
vendored
@ -42,7 +42,9 @@ jobs:
|
|||||||
python-version: ${{ env.PYTHON_VERSION }}
|
python-version: ${{ env.PYTHON_VERSION }}
|
||||||
|
|
||||||
- name: Install Haystack
|
- name: Install Haystack
|
||||||
run: pip install .[all,dev]
|
run: |
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
|
||||||
|
|
||||||
- name: Run
|
- name: Run
|
||||||
run: pytest examples/
|
run: pytest examples/
|
||||||
|
@ -1,23 +1,25 @@
|
|||||||
|
# Disable pylint errors for logging basicConfig
|
||||||
|
# pylint: disable=no-logging-basicconfig
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from haystack.document_stores import ElasticsearchDocumentStore
|
||||||
|
from haystack.nodes import EmbeddingRetriever
|
||||||
|
from haystack.nodes.other.docs2answers import Docs2Answers
|
||||||
|
from haystack.pipelines import Pipeline
|
||||||
|
from haystack.utils import fetch_archive_from_http, launch_es, print_answers
|
||||||
|
|
||||||
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
|
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
|
||||||
logging.getLogger("haystack").setLevel(logging.INFO)
|
logging.getLogger("haystack").setLevel(logging.INFO)
|
||||||
|
|
||||||
from haystack.document_stores import ElasticsearchDocumentStore
|
|
||||||
|
|
||||||
from haystack.nodes import EmbeddingRetriever
|
|
||||||
from haystack.nodes.other.docs2answers import Docs2Answers
|
|
||||||
from haystack.utils import launch_es, print_answers, fetch_archive_from_http
|
|
||||||
import pandas as pd
|
|
||||||
from haystack.pipelines import Pipeline
|
|
||||||
|
|
||||||
|
|
||||||
def basic_faq_pipeline():
|
def basic_faq_pipeline():
|
||||||
document_store = ElasticsearchDocumentStore(
|
document_store = ElasticsearchDocumentStore(
|
||||||
host="localhost",
|
host="localhost",
|
||||||
username="",
|
username="",
|
||||||
password="",
|
password="",
|
||||||
index="document",
|
index="example-document",
|
||||||
embedding_field="question_emb",
|
embedding_field="question_emb",
|
||||||
embedding_dim=384,
|
embedding_dim=384,
|
||||||
excluded_meta_data=["question_emb"],
|
excluded_meta_data=["question_emb"],
|
||||||
@ -52,6 +54,7 @@ def basic_faq_pipeline():
|
|||||||
# Convert Dataframe to list of dicts and index them in our DocumentStore
|
# Convert Dataframe to list of dicts and index them in our DocumentStore
|
||||||
docs_to_index = df.to_dict(orient="records")
|
docs_to_index = df.to_dict(orient="records")
|
||||||
document_store.write_documents(docs_to_index)
|
document_store.write_documents(docs_to_index)
|
||||||
|
document_store.update_embeddings(retriever)
|
||||||
|
|
||||||
# Initialize a Pipeline (this time without a reader) and ask questions
|
# Initialize a Pipeline (this time without a reader) and ask questions
|
||||||
pipeline = Pipeline()
|
pipeline = Pipeline()
|
||||||
@ -62,6 +65,9 @@ def basic_faq_pipeline():
|
|||||||
prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
|
prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
|
||||||
|
|
||||||
print_answers(prediction, details="medium")
|
print_answers(prediction, details="medium")
|
||||||
|
|
||||||
|
# Remove the index once we're done to save space
|
||||||
|
document_store.delete_index(index="example-document")
|
||||||
return prediction
|
return prediction
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,21 +1,23 @@
|
|||||||
|
# Disable pylint errors for logging basicConfig
|
||||||
|
# pylint: disable=no-logging-basicconfig
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from haystack.document_stores import ElasticsearchDocumentStore
|
||||||
|
from haystack.nodes import BM25Retriever, FARMReader
|
||||||
|
from haystack.nodes.file_classifier import FileTypeClassifier
|
||||||
|
from haystack.nodes.file_converter import TextConverter
|
||||||
|
from haystack.nodes.preprocessor import PreProcessor
|
||||||
|
from haystack.pipelines import Pipeline
|
||||||
|
from haystack.utils import fetch_archive_from_http, launch_es, print_answers
|
||||||
|
|
||||||
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
|
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
|
||||||
logging.getLogger("haystack").setLevel(logging.INFO)
|
logging.getLogger("haystack").setLevel(logging.INFO)
|
||||||
|
|
||||||
from haystack.document_stores import ElasticsearchDocumentStore
|
|
||||||
from haystack.utils import fetch_archive_from_http, print_answers, launch_es
|
|
||||||
from haystack.nodes import FARMReader, BM25Retriever
|
|
||||||
from haystack.nodes.file_classifier import FileTypeClassifier
|
|
||||||
from haystack.nodes.preprocessor import PreProcessor
|
|
||||||
from haystack.nodes.file_converter import TextConverter
|
|
||||||
from haystack.pipelines import Pipeline
|
|
||||||
|
|
||||||
|
|
||||||
def basic_qa_pipeline():
|
def basic_qa_pipeline():
|
||||||
# Initialize a DocumentStore
|
# Initialize a DocumentStore
|
||||||
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
|
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="example-document")
|
||||||
|
|
||||||
# fetch, pre-process and write documents
|
# fetch, pre-process and write documents
|
||||||
doc_dir = "data/basic_qa_pipeline"
|
doc_dir = "data/basic_qa_pipeline"
|
||||||
@ -66,6 +68,9 @@ def basic_qa_pipeline():
|
|||||||
)
|
)
|
||||||
|
|
||||||
print_answers(prediction, details="minimum")
|
print_answers(prediction, details="minimum")
|
||||||
|
|
||||||
|
# Remove the index once we're done to save space
|
||||||
|
document_store.delete_index(index="example-document")
|
||||||
return prediction
|
return prediction
|
||||||
|
|
||||||
|
|
||||||
|
@ -55,6 +55,7 @@ def hybrid_search_faq_pipeline():
|
|||||||
# Convert Dataframe to list of dicts and index them in our DocumentStore
|
# Convert Dataframe to list of dicts and index them in our DocumentStore
|
||||||
docs_to_index = df.to_dict(orient="records")
|
docs_to_index = df.to_dict(orient="records")
|
||||||
document_store.write_documents(docs_to_index)
|
document_store.write_documents(docs_to_index)
|
||||||
|
document_store.update_embeddings(retriever=dense_retriever)
|
||||||
|
|
||||||
# Initialize a Pipeline (this time without a reader) and ask questions
|
# Initialize a Pipeline (this time without a reader) and ask questions
|
||||||
pipeline = Pipeline()
|
pipeline = Pipeline()
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from examples.basic_faq_pipeline import basic_faq_pipeline
|
from examples.basic_faq_pipeline import basic_faq_pipeline
|
||||||
|
|
||||||
from haystack.schema import Answer, Document
|
from haystack.schema import Answer
|
||||||
|
|
||||||
|
|
||||||
def test_basic_faq_pipeline():
|
def test_basic_faq_pipeline():
|
||||||
|
@ -6,8 +6,7 @@ from examples.getting_started import getting_started
|
|||||||
from haystack.schema import Answer, Document
|
from haystack.schema import Answer, Document
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"])
|
||||||
@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
|
|
||||||
def test_getting_started(provider):
|
def test_getting_started(provider):
|
||||||
if provider == "anthropic":
|
if provider == "anthropic":
|
||||||
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||||
@ -17,6 +16,8 @@ def test_getting_started(provider):
|
|||||||
api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
|
api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
|
||||||
elif provider == "openai":
|
elif provider == "openai":
|
||||||
api_key = os.environ.get("OPENAI_API_KEY", "")
|
api_key = os.environ.get("OPENAI_API_KEY", "")
|
||||||
|
|
||||||
|
if api_key:
|
||||||
result = getting_started(provider=provider, API_KEY=api_key)
|
result = getting_started(provider=provider, API_KEY=api_key)
|
||||||
|
|
||||||
# Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
|
# Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
|
||||||
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Added document_store.update_embeddings call to pipeline examples so that embeddings are calculated for newly added documents.
|
Loading…
x
Reference in New Issue
Block a user