Adding Example Scripts to Haystack (#3588)

* add 2 example scripts

* fixing faq script

* updating PR based on comments

* black

* updating s3 buckets

* first attempt at testing

* Add basic tests to two scripts

PR: #3588

* make tests runnable

* reformat files

* only run in PRs touching an example

Co-authored-by: bilgeyucel <bilgeyucel96@gmail.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
Tuana Celik 2023-01-27 14:54:59 +01:00 committed by GitHub
parent f6a99b6ebc
commit e1502c8029
6 changed files with 229 additions and 0 deletions

.github/workflows/examples-tests.yml (new file, 40 lines)

@@ -0,0 +1,40 @@
name: Examples tests

on:
  workflow_dispatch: # Activate this workflow manually
  push:
    branches:
      - main
  pull_request:
    paths:
      - examples/**
    types:
      - opened
      - reopened
      - synchronize
      - ready_for_review

env:
  SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

jobs:
  tests:
    name: Examples
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Setup Python
        uses: ./.github/actions/python_cache/

      - name: Install Haystack
        run: pip install .[all]

      - name: Run
        run: pytest examples/

      - uses: act10ns/slack@v1
        with:
          status: ${{ job.status }}
          channel: '#haystack'
        if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'

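To reproduce this CI job locally, here is a minimal sketch, not part of the PR itself; it assumes Haystack was installed from the repository root with pip install .[all] and that the repository root is the working directory, mirroring the workflow's steps above:

# Local stand-in for the workflow's "Run" step (illustrative only).
import sys

import pytest

# Same test selection as the CI job: everything under examples/.
sys.exit(pytest.main(["examples/"]))
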
examples/__init__.py (new, empty file)

examples/basic_faq_pipeline.py (new file, 71 lines)

@@ -0,0 +1,71 @@
import logging

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.utils import launch_es, print_answers, fetch_archive_from_http
import pandas as pd
from haystack.pipelines import Pipeline


def basic_faq_pipeline():
    # Start Elasticsearch (requires Docker)
    launch_es()

    # Initialize the DocumentStore; question embeddings are stored in a dedicated field
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        embedding_field="question_emb",
        embedding_dim=384,
        excluded_meta_data=["question_emb"],
        similarity="cosine",
    )

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
        use_gpu=True,
        scale_score=False,
    )

    # Turns retrieved FAQ documents into Answer objects
    doc_to_answers = Docs2Answers()

    # Fetch the FAQ dataset
    doc_dir = "data/basic_faq_pipeline"
    s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")

    # Minimal cleaning
    df.fillna(value="", inplace=True)
    df["question"] = df["question"].apply(lambda x: x.strip())
    print(df.head())

    # Get embeddings for our questions from the FAQs
    questions = list(df["question"].values)
    df["question_emb"] = retriever.embed_queries(queries=questions).tolist()
    df = df.rename(columns={"question": "content"})

    # Convert Dataframe to list of dicts and index them in our DocumentStore
    docs_to_index = df.to_dict(orient="records")
    document_store.write_documents(docs_to_index)

    # Initialize a Pipeline (this time without a reader) and ask questions
    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["Retriever"])

    # Ask a question
    prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})

    print_answers(prediction, details="medium")
    return prediction


if __name__ == "__main__":
    basic_faq_pipeline()

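A short usage sketch for the script above, illustrative and not part of the PR; it assumes Docker is available so that launch_es() can start Elasticsearch on localhost:

from examples.basic_faq_pipeline import basic_faq_pipeline

# Indexes the FAQ CSV, then matches the query against the stored
# question embeddings and returns the prediction dict.
prediction = basic_faq_pipeline()

# Each match comes back as a haystack.schema.Answer; the accompanying
# test expects scores between 0 and 1.
for ans in prediction["answers"]:
    print(f"{ans.score:.3f}  {ans.answer[:60]}")
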
examples/basic_qa_pipeline.py (new file, 76 lines)

@@ -0,0 +1,76 @@
import logging
from pathlib import Path

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, print_answers, launch_es
from haystack.nodes import FARMReader, BM25Retriever
from haystack.nodes.file_classifier import FileTypeClassifier
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.file_converter import TextConverter
from haystack.pipelines import Pipeline


def basic_qa_pipeline():
    # Launch Elasticsearch
    launch_es()

    # Initialize a DocumentStore
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

    # Fetch, pre-process and write documents
    doc_dir = "data/basic_qa_pipeline"
    s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/wiki_gameofthrones_txt1.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    file_paths = [p for p in Path(doc_dir).glob("**/*")]
    files_metadata = [{"name": path.name} for path in file_paths]

    # Indexing Pipeline
    indexing_pipeline = Pipeline()

    # Makes sure the file is a TXT file (FileTypeClassifier node)
    classifier = FileTypeClassifier()
    indexing_pipeline.add_node(classifier, name="Classifier", inputs=["File"])

    # Converts a file into text and performs basic cleaning (TextConverter node)
    text_converter = TextConverter(remove_numeric_tables=True)
    indexing_pipeline.add_node(text_converter, name="Text_converter", inputs=["Classifier.output_1"])

    # Pre-processes the text by performing splits and adding metadata to the text (PreProcessor node)
    preprocessor = PreProcessor(
        clean_whitespace=True,
        clean_empty_lines=True,
        split_length=100,
        split_overlap=50,
        split_respect_sentence_boundary=True,
    )
    indexing_pipeline.add_node(preprocessor, name="Preprocessor", inputs=["Text_converter"])

    # Writes the resulting documents into the document store
    indexing_pipeline.add_node(document_store, name="Document_Store", inputs=["Preprocessor"])

    # Then we run it with the documents and their metadata as input
    indexing_pipeline.run(file_paths=file_paths, meta=files_metadata)

    # Initialize Retriever & Reader
    retriever = BM25Retriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    # Query Pipeline
    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

    prediction = pipeline.run(
        query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    )

    print_answers(prediction, details="minimum")
    return prediction


if __name__ == "__main__":
    basic_qa_pipeline()

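Likewise, a minimal usage sketch for the QA script, again illustrative and assuming a local Elasticsearch instance:

from examples.basic_qa_pipeline import basic_qa_pipeline

# Indexes the Game of Thrones text files, then runs an extractive QA query.
prediction = basic_qa_pipeline()

# The reader returns extracted answer spans; the retrieved documents are
# included as well, each carrying the file name attached at indexing time.
print(prediction["answers"][0].answer)
for doc in prediction["documents"]:
    print(doc.meta["name"], round(doc.score, 3))
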
examples/test_basic_faq_pipeline.py (new file, 19 lines)

@@ -0,0 +1,19 @@
from examples.basic_faq_pipeline import basic_faq_pipeline

from haystack.schema import Answer, Document


def test_basic_faq_pipeline():
    prediction = basic_faq_pipeline()

    assert prediction is not None
    assert prediction["query"] == "How is the virus spreading?"

    assert len(prediction["answers"]) == 10  # top-k of Retriever
    assert type(prediction["answers"][0]) == Answer
    assert (
        prediction["answers"][0].answer
        == """This virus was first detected in Wuhan City, Hubei Province, China. The first infections were linked to a live animal market, but the virus is now spreading from person-to-person. Its important to note that person-to-person spread can happen on a continuum. Some viruses are highly contagious (like measles), while other viruses are less so.\n\nThe virus that causes COVID-19 seems to be spreading easily and sustainably in the community (“community spread”) in some affected geographic areas. Community spread means people have been infected with the virus in an area, including some who are not sure how or where they became infected.\n\nLearn what is known about the spread of newly emerged coronaviruses."""
    )
    assert prediction["answers"][0].score <= 1
    assert prediction["answers"][0].score >= 0

examples/test_basic_qa_pipeline.py (new file, 23 lines)

@@ -0,0 +1,23 @@
from examples.basic_qa_pipeline import basic_qa_pipeline

from haystack.schema import Answer, Document


def test_basic_qa_pipeline():
    prediction = basic_qa_pipeline()

    assert prediction is not None
    assert prediction["query"] == "Who is the father of Arya Stark?"

    assert len(prediction["answers"]) == 5  # top-k of Reader
    assert type(prediction["answers"][0]) == Answer
    assert prediction["answers"][0].answer == "Ned"
    assert prediction["answers"][0].score <= 1
    assert prediction["answers"][0].score >= 0
    assert prediction["answers"][0].meta["name"] == "43_Arya_Stark.txt"

    assert len(prediction["documents"]) == 10  # top-k of Retriever
    assert type(prediction["documents"][0]) == Document
    assert prediction["documents"][0].score <= 1
    assert prediction["documents"][0].score >= 0
    assert prediction["documents"][0].meta["name"] == "450_Baelor.txt"
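
The assertions in both tests rely on public fields of Haystack's Answer and Document classes. A toy sketch of those fields, for illustration only; the values below are made up to mirror the expected test results:

from haystack.schema import Answer, Document

# Fields exercised by the tests: Answer.answer, Answer.score, Answer.meta,
# Document.score, Document.meta.
ans = Answer(answer="Ned", score=0.93, meta={"name": "43_Arya_Stark.txt"})
print(ans.answer, ans.score, ans.meta["name"])

doc = Document(content="Eddard 'Ned' Stark is the head of House Stark.", meta={"name": "450_Baelor.txt"})
print(doc.meta["name"])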