Tutorial 14 edit (#2663)

* Rewrite Tutorial 14 for increased user-friendliness
* Update Tutorial14 .py file to match .ipynb file
* Update Documentation & Code Style
* unblock the ci
* ignore error in jitterbit/get-changed-files

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
2025-12-02 18:06:35 +00:00 · 2022-06-22 13:03:07 +02:00 · 2022-06-22 13:03:07 +02:00 · b87c0c950b
commit b87c0c950b
parent 325bc5466a
3 changed files with 368 additions and 6590 deletions
--- a/.github/workflows/tutorials.yml
+++ b/.github/workflows/tutorials.yml
@@ -65,6 +65,7 @@ jobs:

    - uses: jitterbit/get-changed-files@v1
      id: diff
+      continue-on-error: true
      with:
        format: space-delimited
        token: ${{ secrets.GITHUB_TOKEN }}
--- a/tutorials/Tutorial14_Query_Classifier.ipynb
+++ b/tutorials/Tutorial14_Query_Classifier.ipynb
--- a/tutorials/Tutorial14_Query_Classifier.py
+++ b/tutorials/Tutorial14_Query_Classifier.py
@@ -15,10 +15,56 @@ from haystack.nodes import (
    TransformersQueryClassifier,
    SklearnQueryClassifier,
 )
+import pandas as pd


 def tutorial14_query_classifier():
+    """Tutorial 14: Query Classifiers"""

+    # Useful for framing headers
+    def print_header(header):
+        equal_line = "=" * len(header)
+        print(f"\n{equal_line}\n{header}\n{equal_line}\n")
+
+    # Try out the SklearnQueryClassifier on its own
+    # Keyword vs. Question/Statement Classification
+    keyword_classifier = SklearnQueryClassifier()
+    queries = [
+        "Arya Stark father",  # Keyword Query
+        "Who was the father of Arya Stark",  # Interrogative Query
+        "Lord Eddard was the father of Arya Stark",  # Statement Query
+    ]
+    k_vs_qs_results = {"Query": [], "Output Branch": [], "Class": []}
+    for query in queries:
+        result = keyword_classifier.run(query=query)
+        k_vs_qs_results["Query"].append(query)
+        k_vs_qs_results["Output Branch"].append(result[1])
+        k_vs_qs_results["Class"].append("Question/Statement" if result[1] == "output_1" else "Keyword")
+    print_header("Keyword vs. Question/Statement Classification")
+    print(pd.DataFrame.from_dict(k_vs_qs_results))
+    print("")
+
+    # Question vs. Statement Classification
+    model_url = (
+        "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle"
+    )
+    vectorizer_url = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle"
+    question_classifier = SklearnQueryClassifier(model_name_or_path=model_url, vectorizer_name_or_path=vectorizer_url)
+    queries = [
+        "Who was the father of Arya Stark",  # Interrogative Query
+        "Lord Eddard was the father of Arya Stark",  # Statement Query
+    ]
+    q_vs_s_results = {"Query": [], "Output Branch": [], "Class": []}
+    for query in queries:
+        result = question_classifier.run(query=query)
+        q_vs_s_results["Query"].append(query)
+        q_vs_s_results["Output Branch"].append(result[1])
+        q_vs_s_results["Class"].append("Question" if result[1] == "output_1" else "Statement")
+    print_header("Question vs. Statement Classification")
+    print(pd.DataFrame.from_dict(q_vs_s_results))
+    print("")
+
+    # Use in pipelines
    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/tutorial14"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip"
@@ -33,10 +79,13 @@ def tutorial14_query_classifier():
    document_store.delete_documents()
    document_store.write_documents(got_docs)

-    # Initialize Sparse retriever
+    # Pipelines with Keyword vs. Question/Statement Classification
+    print_header("PIPELINES WITH KEYWORD VS. QUESTION/STATEMENT CLASSIFICATION")
+
+    # Initialize sparse retriever for keyword queries
    bm25_retriever = BM25Retriever(document_store=document_store)

-    # Initialize dense retriever
+    # Initialize dense retriever for question/statement queries
    embedding_retriever = EmbeddingRetriever(
        document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
    )
@@ -44,10 +93,8 @@ def tutorial14_query_classifier():

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

-    print()
-    print("Sklearn keyword classifier")
-    print("==========================")
-    # Here we build the pipeline
+    # Pipeline 1: SklearnQueryClassifier
+    print_header("Pipeline 1: SklearnQueryClassifier")
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    sklearn_keyword_classifier.add_node(
@@ -57,48 +104,23 @@ def tutorial14_query_classifier():
        component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]
    )
    sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
-    sklearn_keyword_classifier.draw("pipeline_classifier.png")
+    sklearn_keyword_classifier.draw("sklearn_keyword_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?")
-    print("\n===============================")
-    print("Embedding Retriever Results" + "\n" + "=" * 15)
+    print_header("Question Query Results")
    print_answers(res_1, details="minimum")
+    print("")

    # Run only the sparse retriever on a keyword based query
    res_2 = sklearn_keyword_classifier.run(query="arya stark father")
-    print("\n===============================")
-    print("ES Results" + "\n" + "=" * 15)
+    print_header("Keyword Query Results")
    print_answers(res_2, details="minimum")
+    print("")

-    # Run only the dense retriever on the full sentence query
-    res_3 = sklearn_keyword_classifier.run(query="which country was jon snow filmed ?")
-    print("\n===============================")
-    print("Embedding Retriever Results" + "\n" + "=" * 15)
-    print_answers(res_3, details="minimum")
+    # Pipeline 2: TransformersQueryClassifier
+    print_header("Pipeline 2: TransformersQueryClassifier")

-    # Run only the sparse retriever on a keyword based query
-    res_4 = sklearn_keyword_classifier.run(query="jon snow country")
-    print("\n===============================")
-    print("ES Results" + "\n" + "=" * 15)
-    print_answers(res_4, details="minimum")
-
-    # Run only the dense retriever on the full sentence query
-    res_5 = sklearn_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
-    print("\n===============================")
-    print("Embedding Retriever Results" + "\n" + "=" * 15)
-    print_answers(res_5, details="minimum")
-
-    # Run only the sparse retriever on a keyword based query
-    res_6 = sklearn_keyword_classifier.run(query="arya stark younger brothers")
-    print("\n===============================")
-    print("ES Results" + "\n" + "=" * 15)
-    print_answers(res_6, details="minimum")
-
-    print()
-    print("Transformer keyword classifier")
-    print("==============================")
-    # Here we build the pipeline
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(
        component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]
@@ -112,49 +134,21 @@ def tutorial14_query_classifier():
    transformer_keyword_classifier.add_node(
        component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]
    )
-    transformer_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?")
-    print("\n===============================")
-    print("Embedding Retriever Results" + "\n" + "=" * 15)
+    print_header("Question Query Results")
    print_answers(res_1, details="minimum")
+    print("")

    # Run only the sparse retriever on a keyword based query
    res_2 = transformer_keyword_classifier.run(query="arya stark father")
-    print("\n===============================")
-    print("ES Results" + "\n" + "=" * 15)
+    print_header("Keyword Query Results")
    print_answers(res_2, details="minimum")
+    print("")

-    # Run only the dense retriever on the full sentence query
-    res_3 = transformer_keyword_classifier.run(query="which country was jon snow filmed ?")
-    print("\n===============================")
-    print("Embedding Retriever Results" + "\n" + "=" * 15)
-    print_answers(res_3, details="minimum")
-
-    # Run only the sparse retriever on a keyword based query
-    res_4 = transformer_keyword_classifier.run(query="jon snow country")
-    print("\n===============================")
-    print("ES Results" + "\n" + "=" * 15)
-    print_answers(res_4, details="minimum")
-
-    # Run only the dense retriever on the full sentence query
-    res_5 = transformer_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
-    print("\n===============================")
-    print("Embedding Retriever Results" + "\n" + "=" * 15)
-    print_answers(res_5, details="minimum")
-
-    # Run only the sparse retriever on a keyword based query
-    res_6 = transformer_keyword_classifier.run(query="arya stark younger brothers")
-    print("\n===============================")
-    print("ES Results" + "\n" + "=" * 15)
-    print_answers(res_6, details="minimum")
-
-    print()
-    print("Transformer question classifier")
-    print("===============================")
-
-    # Here we build the pipeline
+    # Pipeline with Question vs. Statement Classification
+    print_header("PIPELINE WITH QUESTION VS. STATEMENT CLASSIFICATION")
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
    transformer_question_classifier.add_node(
@@ -163,58 +157,18 @@ def tutorial14_query_classifier():
        inputs=["EmbeddingRetriever"],
    )
    transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
-    transformer_question_classifier.draw("question_classifier.png")
+    transformer_question_classifier.draw("transformer_question_classifier.png")

    # Run only the QA reader on the question query
    res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?")
-    print("\n===============================")
-    print("Embedding Retriever Results" + "\n" + "=" * 15)
+    print_header("Question Query Results")
    print_answers(res_1, details="minimum")
+    print("")

    res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.")
-    print("\n===============================")
-    print("ES Results" + "\n" + "=" * 15)
+    print_header("Statement Query Results")
    print_documents(res_2)
-
-    # Here we create the keyword vs question/statement query classifier
-
-    queries = [
-        "arya stark father",
-        "jon snow country",
-        "who is the father of arya stark",
-        "which country was jon snow filmed?",
-    ]
-
-    keyword_classifier = TransformersQueryClassifier()
-
-    for query in queries:
-        result = keyword_classifier.run(query=query)
-        if result[1] == "output_1":
-            category = "question/statement"
-        else:
-            category = "keyword"
-
-        print(f"Query: {query}, raw_output: {result}, class: {category}")
-
-    # Here we create the question vs statement query classifier
-
-    queries = [
-        "Lord Eddard was the father of Arya Stark.",
-        "Jon Snow was filmed in United Kingdom.",
-        "who is the father of arya stark?",
-        "Which country was jon snow filmed in?",
-    ]
-
-    question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")
-
-    for query in queries:
-        result = question_classifier.run(query=query)
-        if result[1] == "output_1":
-            category = "question"
-        else:
-            category = "statement"
-
-        print(f"Query: {query}, raw_output: {result}, class: {category}")
+    print("")


 if __name__ == "__main__":