# haystack/tutorials/Tutorial14_Query_Classifier.py

import logging

# Configure how logging messages should be displayed and which log level should
# be used BEFORE importing Haystack, so Haystack's own loggers pick it up.
# Example log message:
# INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt
# Default log level in basicConfig is WARNING, so the explicit parameter is not
# necessary but can be changed easily:
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.utils import (
    fetch_archive_from_http,
    convert_files_to_docs,
    clean_wiki_text,
    launch_es,
    print_answers,
    print_documents,
)
from haystack.pipelines import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import (
    BM25Retriever,
    EmbeddingRetriever,
    FARMReader,
    TransformersQueryClassifier,
    SklearnQueryClassifier,
)

import pandas as pd
def tutorial14_query_classifier():
    """Tutorial 14: Query Classifiers.

    Demonstrates query classification in Haystack in three stages:

    1. Running ``SklearnQueryClassifier`` standalone, both for
       keyword-vs-question/statement and question-vs-statement classification.
    2. Routing queries inside a ``Pipeline`` to a sparse (BM25) or dense
       (embedding) retriever depending on the classifier's output branch,
       using both the sklearn- and transformers-based classifiers.
    3. Using a classifier AFTER retrieval to decide whether a reader should
       extract an answer (questions) or raw documents should be returned
       (statements).

    Side effects: downloads models and a Game of Thrones corpus, launches /
    connects to Elasticsearch, (re)writes the document store, renders two
    pipeline diagrams as PNG files, and prints results to stdout.
    """

    def print_header(header):
        # Frame a header between two lines of '=' for readable console output.
        equal_line = "=" * len(header)
        print(f"\n{equal_line}\n{header}\n{equal_line}\n")

    # ------------------------------------------------------------------
    # Try out the SklearnQueryClassifier on its own
    # ------------------------------------------------------------------

    # Keyword vs. Question/Statement Classification
    keyword_classifier = SklearnQueryClassifier()

    queries = [
        "Arya Stark father",  # Keyword Query
        "Who was the father of Arya Stark",  # Interrogative Query
        "Lord Eddard was the father of Arya Stark",  # Statement Query
    ]

    # BaseQueryClassifier.run() returns (output_dict, edge_name); result[1]
    # is the output branch the query was routed to ("output_1"/"output_2").
    k_vs_qs_results = {"Query": [], "Output Branch": [], "Class": []}
    for query in queries:
        result = keyword_classifier.run(query=query)
        k_vs_qs_results["Query"].append(query)
        k_vs_qs_results["Output Branch"].append(result[1])
        k_vs_qs_results["Class"].append("Question/Statement" if result[1] == "output_1" else "Keyword")

    print_header("Keyword vs. Question/Statement Classification")
    print(pd.DataFrame.from_dict(k_vs_qs_results))
    print("")

    # Question vs. Statement Classification: load a model/vectorizer pair
    # trained to separate interrogative from declarative queries.
    model_url = (
        "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle"
    )
    vectorizer_url = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle"

    question_classifier = SklearnQueryClassifier(model_name_or_path=model_url, vectorizer_name_or_path=vectorizer_url)

    queries = [
        "Who was the father of Arya Stark",  # Interrogative Query
        "Lord Eddard was the father of Arya Stark",  # Statement Query
    ]

    q_vs_s_results = {"Query": [], "Output Branch": [], "Class": []}
    for query in queries:
        result = question_classifier.run(query=query)
        q_vs_s_results["Query"].append(query)
        q_vs_s_results["Output Branch"].append(result[1])
        q_vs_s_results["Class"].append("Question" if result[1] == "output_1" else "Statement")

    print_header("Question vs. Statement Classification")
    print(pd.DataFrame.from_dict(q_vs_s_results))
    print("")

    # ------------------------------------------------------------------
    # Use in pipelines
    # ------------------------------------------------------------------

    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/tutorial14"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Initialize DocumentStore and index documents; delete first so reruns
    # start from a clean index.
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_documents()
    document_store.write_documents(got_docs)

    # Pipelines with Keyword vs. Question/Statement Classification
    print_header("PIPELINES WITH KEYWORD VS. QUESTION/STATEMENT CLASSIFICATION")

    # Initialize sparse retriever for keyword queries
    bm25_retriever = BM25Retriever(document_store=document_store)

    # Initialize dense retriever for question/statement queries
    embedding_retriever = EmbeddingRetriever(
        document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
    )
    document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    # Pipeline 1: SklearnQueryClassifier
    # output_1 (questions/statements) -> dense retriever,
    # output_2 (keyword queries)      -> sparse BM25 retriever.
    print_header("Pipeline 1: SklearnQueryClassifier")
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    sklearn_keyword_classifier.add_node(
        component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
    )
    sklearn_keyword_classifier.add_node(
        component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]
    )
    sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
    sklearn_keyword_classifier.draw("sklearn_keyword_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?")
    print_header("Question Query Results")
    print_answers(res_1, details="minimum")
    print("")

    # Run only the sparse retriever on a keyword based query
    res_2 = sklearn_keyword_classifier.run(query="arya stark father")
    print_header("Keyword Query Results")
    print_answers(res_2, details="minimum")
    print("")

    # Pipeline 2: TransformersQueryClassifier — same topology as Pipeline 1,
    # but with the transformer-based classifier at the root.
    print_header("Pipeline 2: TransformersQueryClassifier")
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(
        component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]
    )
    transformer_keyword_classifier.add_node(
        component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
    )
    transformer_keyword_classifier.add_node(
        component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]
    )
    transformer_keyword_classifier.add_node(
        component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]
    )

    # Run only the dense retriever on the full sentence query
    res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?")
    print_header("Question Query Results")
    print_answers(res_1, details="minimum")
    print("")

    # Run only the sparse retriever on a keyword based query
    res_2 = transformer_keyword_classifier.run(query="arya stark father")
    print_header("Keyword Query Results")
    print_answers(res_2, details="minimum")
    print("")

    # Pipeline with Question vs. Statement Classification: classify AFTER
    # retrieval — questions (output_1) go to the reader, statements fall off
    # the pipeline and only the retrieved documents are returned.
    print_header("PIPELINE WITH QUESTION VS. STATEMENT CLASSIFICATION")
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
    transformer_question_classifier.add_node(
        component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"),
        name="QueryClassifier",
        inputs=["EmbeddingRetriever"],
    )
    transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
    transformer_question_classifier.draw("transformer_question_classifier.png")

    # Run only the QA reader on the question query
    res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?")
    print_header("Question Query Results")
    print_answers(res_1, details="minimum")
    print("")

    # Statement query: no answers are extracted, so print the documents instead
    res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.")
    print_header("Statement Query Results")
    print_documents(res_2)
    print("")
# Script entry point: run the tutorial only when executed directly, not when
# this module is imported.
if __name__ == "__main__":
    tutorial14_query_classifier()
# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/