diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index ac1acb095..916248c9c 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -42,7 +42,9 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install Haystack - run: pip install .[all,dev] + run: | + pip install --upgrade pip + pip install .[inference,dev,elasticsearch,preprocessing,file-conversion] - name: Run run: pytest examples/ diff --git a/examples/basic_faq_pipeline.py b/examples/basic_faq_pipeline.py index 50eacbca9..e198ca536 100644 --- a/examples/basic_faq_pipeline.py +++ b/examples/basic_faq_pipeline.py @@ -1,23 +1,25 @@ +# Disable pylint errors for logging basicConfig +# pylint: disable=no-logging-basicconfig import logging +import pandas as pd + +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.nodes import EmbeddingRetriever +from haystack.nodes.other.docs2answers import Docs2Answers +from haystack.pipelines import Pipeline +from haystack.utils import fetch_archive_from_http, launch_es, print_answers + logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) logging.getLogger("haystack").setLevel(logging.INFO) -from haystack.document_stores import ElasticsearchDocumentStore - -from haystack.nodes import EmbeddingRetriever -from haystack.nodes.other.docs2answers import Docs2Answers -from haystack.utils import launch_es, print_answers, fetch_archive_from_http -import pandas as pd -from haystack.pipelines import Pipeline - def basic_faq_pipeline(): document_store = ElasticsearchDocumentStore( host="localhost", username="", password="", - index="document", + index="example-document", embedding_field="question_emb", embedding_dim=384, excluded_meta_data=["question_emb"], @@ -52,6 +54,7 @@ def basic_faq_pipeline(): # Convert Dataframe to list of dicts and index them in our DocumentStore docs_to_index = df.to_dict(orient="records") 
document_store.write_documents(docs_to_index) + document_store.update_embeddings(retriever) # Initialize a Pipeline (this time without a reader) and ask questions pipeline = Pipeline() @@ -62,6 +65,9 @@ def basic_faq_pipeline(): prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}}) print_answers(prediction, details="medium") + + # Remove the index once we're done to save space + document_store.delete_index(index="example-document") return prediction diff --git a/examples/basic_qa_pipeline.py b/examples/basic_qa_pipeline.py index 9e90114d0..97988627e 100644 --- a/examples/basic_qa_pipeline.py +++ b/examples/basic_qa_pipeline.py @@ -1,21 +1,23 @@ +# Disable pylint errors for logging basicConfig +# pylint: disable=no-logging-basicconfig import logging from pathlib import Path +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.nodes import BM25Retriever, FARMReader +from haystack.nodes.file_classifier import FileTypeClassifier +from haystack.nodes.file_converter import TextConverter +from haystack.nodes.preprocessor import PreProcessor +from haystack.pipelines import Pipeline +from haystack.utils import fetch_archive_from_http, launch_es, print_answers + logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) logging.getLogger("haystack").setLevel(logging.INFO) -from haystack.document_stores import ElasticsearchDocumentStore -from haystack.utils import fetch_archive_from_http, print_answers, launch_es -from haystack.nodes import FARMReader, BM25Retriever -from haystack.nodes.file_classifier import FileTypeClassifier -from haystack.nodes.preprocessor import PreProcessor -from haystack.nodes.file_converter import TextConverter -from haystack.pipelines import Pipeline - def basic_qa_pipeline(): # Initialize a DocumentStore - document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") + document_store = 
ElasticsearchDocumentStore(host="localhost", username="", password="", index="example-document") # fetch, pre-process and write documents doc_dir = "data/basic_qa_pipeline" @@ -66,6 +68,9 @@ def basic_qa_pipeline(): ) print_answers(prediction, details="minimum") + + # Remove the index once we're done to save space + document_store.delete_index(index="example-document") return prediction diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py index 2e2a86aca..d4fcba6cf 100644 --- a/examples/hybrid_search_faq_pipeline.py +++ b/examples/hybrid_search_faq_pipeline.py @@ -55,6 +55,7 @@ def hybrid_search_faq_pipeline(): # Convert Dataframe to list of dicts and index them in our DocumentStore docs_to_index = df.to_dict(orient="records") document_store.write_documents(docs_to_index) + document_store.update_embeddings(retriever=dense_retriever) # Initialize a Pipeline (this time without a reader) and ask questions pipeline = Pipeline() diff --git a/examples/test_basic_faq_pipeline.py b/examples/test_basic_faq_pipeline.py index cb038eff1..b637ad722 100644 --- a/examples/test_basic_faq_pipeline.py +++ b/examples/test_basic_faq_pipeline.py @@ -1,6 +1,6 @@ from examples.basic_faq_pipeline import basic_faq_pipeline -from haystack.schema import Answer, Document +from haystack.schema import Answer def test_basic_faq_pipeline(): diff --git a/examples/test_getting_started.py b/examples/test_getting_started.py index 657577c9f..ba33e9755 100644 --- a/examples/test_getting_started.py +++ b/examples/test_getting_started.py @@ -6,8 +6,7 @@ from examples.getting_started import getting_started from haystack.schema import Answer, Document -@pytest.mark.integration -@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"]) +@pytest.mark.parametrize("provider", ["anthropic", "cohere", "huggingface", "openai"]) def test_getting_started(provider): if provider == "anthropic": api_key = os.environ.get("ANTHROPIC_API_KEY", "") @@ -17,9 +16,11 @@ 
def test_getting_started(provider): api_key = os.environ.get("HUGGINGFACE_API_KEY", "") elif provider == "openai": api_key = os.environ.get("OPENAI_API_KEY", "") - result = getting_started(provider=provider, API_KEY=api_key) - # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly. - assert isinstance(result, dict) - assert type(result["answers"][0]) == Answer - assert type(result["documents"][0]) == Document + if api_key: + result = getting_started(provider=provider, API_KEY=api_key) + + # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly. + assert isinstance(result, dict) + assert type(result["answers"][0]) == Answer + assert type(result["documents"][0]) == Document diff --git a/releasenotes/notes/hybrid_search_faq_pipeline.py-815df846dca7e872.yaml b/releasenotes/notes/hybrid_search_faq_pipeline.py-815df846dca7e872.yaml new file mode 100644 index 000000000..660f7d692 --- /dev/null +++ b/releasenotes/notes/hybrid_search_faq_pipeline.py-815df846dca7e872.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + Added document_store.update_embeddings call to pipeline examples so that embeddings are calculated for newly added documents.