From d81897535e8306493eabb924e1ec4d8ffc4e4e46 Mon Sep 17 00:00:00 2001
From: Sara Zan
Date: Fri, 19 Nov 2021 11:34:32 +0100
Subject: [PATCH] Public demo (#1747)

* Queries now run only when pressing RUN. File upload hidden. Question is not sent if the textbox is empty.

* Add latest docstring and tutorial changes

* Tidy up: remove needless state, add comments, fix minor bugs

* Had to add results to the status to avoid some bugs in eval mode

* Added 'credits'

* Add footers, update requirements, some random questions for the evaluation

* Add requested changes

* Temporary rollback the UI to the old GoT dataset

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 rest_api/controller/search.py |   6 +
 ui/requirements.txt           |   4 +-
 ui/utils.py                   |  11 +-
 ui/webapp.py                  | 255 +++++++++++++++++++---------------
 4 files changed, 158 insertions(+), 118 deletions(-)

diff --git a/rest_api/controller/search.py b/rest_api/controller/search.py
index b9010bed3..e45557bfe 100644
--- a/rest_api/controller/search.py
+++ b/rest_api/controller/search.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 from fastapi import APIRouter
+import haystack
 from haystack.pipelines.base import Pipeline
 from rest_api.config import PIPELINE_YAML_PATH, QUERY_PIPELINE_NAME
 from rest_api.config import LOG_LEVEL, CONCURRENT_REQUEST_PER_WORKER
@@ -42,6 +43,11 @@ def check_status():
     return True
 
 
+@router.get("/hs_version")
+def haystack_version():
+    return {"hs_version": haystack.__version__}
+
+
 @router.post("/query", response_model=QueryResponse, response_model_exclude_none=True)
 def query(request: QueryRequest):
     with concurrency_limiter.run():
diff --git a/ui/requirements.txt b/ui/requirements.txt
index 14d9d9fa3..76c407b5c 100644
--- a/ui/requirements.txt
+++ b/ui/requirements.txt
@@ -1,2 +1,2 @@
-streamlit>=0.84.0
-st-annotated-text==1.1.0
+streamlit>=1.2.0
+st-annotated-text==2.0.0
diff --git a/ui/utils.py b/ui/utils.py
index 7cc18ad47..d2577cc6b 100644
--- a/ui/utils.py
+++ b/ui/utils.py
@@ -6,6 +6,7 @@ import streamlit as st
 
 API_ENDPOINT = os.getenv("API_ENDPOINT", "http://localhost:8000")
 STATUS = "initialized"
+HS_VERSION = "hs_version"
 DOC_REQUEST = "query"
 DOC_FEEDBACK = "feedback"
 DOC_UPLOAD = "file-upload"
@@ -20,8 +21,12 @@ def haystack_is_ready():
         logging.exception(e)
     return False
 
+@st.cache
+def haystack_version():
+    url = f"{API_ENDPOINT}/{HS_VERSION}"
+    return requests.get(url).json()["hs_version"]
+
 
-@st.cache(show_spinner=False)
 def retrieve_doc(query, filters=None, top_k_reader=5, top_k_retriever=5):
     # Query Haystack API
     url = f"{API_ENDPOINT}/{DOC_REQUEST}"
@@ -31,6 +36,10 @@ def retrieve_doc(query, filters=None, top_k_reader=5, top_k_retriever=5):
 
     # Format response
     result = []
+
+    if "errors" in response_raw:
+        raise Exception(", ".join(response_raw["errors"]))
+
     answers = response_raw["answers"]
     for i in range(len(answers)):
         answer = answers[i]
diff --git a/ui/webapp.py b/ui/webapp.py
index 68e16f3f7..5d84e196c 100644
--- a/ui/webapp.py
+++ b/ui/webapp.py
@@ -3,6 +3,7 @@
 import sys
 import logging
 
 import pandas as pd
+from pathlib import Path
 import streamlit as st
 from annotated_text import annotated_text
@@ -10,177 +11,201 @@ from annotated_text import annotated_text
 # and every value gets lost. 
To keep track of our feedback state we use the official streamlit gist mentioned # here https://gist.github.com/tvst/036da038ab3e999a64497f42de966a92 import SessionState -from utils import feedback_doc, haystack_is_ready, retrieve_doc, upload_doc +from utils import HS_VERSION, feedback_doc, haystack_is_ready, retrieve_doc, upload_doc, haystack_version + # Adjust to a question that you would like users to see in the search bar when they load the UI: -DEFAULT_QUESTION_AT_STARTUP = "Who is the father of Arya Stark?" +DEFAULT_QUESTION_AT_STARTUP = "Who's the father of Arya Stark?" +# Labels for the evaluation +EVAL_LABELS = os.getenv("EVAL_FILE", Path(__file__).parent / "eval_labels_example.csv") -def annotate_answer(answer, context): - """ If we are using an extractive QA pipeline, we'll get answers - from the API that we highlight in the given context""" - start_idx = context.find(answer) - end_idx = start_idx + len(answer) - # calculate dynamic height depending on context length - height = int(len(context) * 0.50) + 5 - annotated_text(context[:start_idx], (answer, "ANSWER", "#8ef"), context[end_idx:], height=height) +# Whether the file upload should be enabled or not +DISABLE_FILE_UPLOAD = os.getenv("HAYSTACK_UI_DISABLE_FILE_UPLOAD") - -def show_plain_documents(text): - """ If we are using a plain document search pipeline, i.e. only retriever, we'll get plain documents - from the API that we just show without any highlighting""" - st.markdown(text) - - -def random_questions(df): - """ - Helper to get one random question + gold random_answer from the user's CSV 'eval_labels_example'. - This can then be shown in the UI when the evaluation mode is selected. Users can easily give feedback on the - model's results and "enrich" the eval dataset with more acceptable labels - """ - random_row = df.sample(1) - random_question = random_row["Question Text"].values[0] - random_answer = random_row["Answer"].values[0] - return random_question, random_answer +# Retrieve Haystack version from the REST API +HS_VERSION = haystack_version() def main(): - # Define state - state_question = SessionState.get( - random_question=DEFAULT_QUESTION_AT_STARTUP, random_answer="", next_question="false", run_query="false" + + # Persistent state + state = SessionState.get( + random_question=DEFAULT_QUESTION_AT_STARTUP, + random_answer="", + results=None, + raw_json=None, + get_next_question=True ) - # Initialize variables - eval_mode = False - random_question = DEFAULT_QUESTION_AT_STARTUP - eval_labels = os.getenv("EVAL_FILE", "eval_labels_example.csv") + # Small callback to reset the interface in case the text of the question changes + def reset_results(*args): + state.results = None + state.raw_json = None - # UI search bar and sidebar + # Title st.write("# Haystack Demo") + + # Sidebar st.sidebar.header("Options") top_k_reader = st.sidebar.slider("Max. number of answers", min_value=1, max_value=10, value=3, step=1) - top_k_retriever = st.sidebar.slider( - "Max. number of documents from retriever", min_value=1, max_value=10, value=3, step=1 - ) + top_k_retriever = st.sidebar.slider("Max. 
number of documents from retriever", min_value=1, max_value=10, value=3, step=1) eval_mode = st.sidebar.checkbox("Evaluation mode") debug = st.sidebar.checkbox("Show debug info") - st.sidebar.write("## File Upload:") - data_files = st.sidebar.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True) - for data_file in data_files: - # Upload file - if data_file: - raw_json = upload_doc(data_file) - st.sidebar.write(raw_json) - if debug: - st.subheader("REST API JSON response") - st.sidebar.write(raw_json) + # File upload block + if not DISABLE_FILE_UPLOAD: + st.sidebar.write("## File Upload:") + data_files = st.sidebar.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True) + for data_file in data_files: + # Upload file + if data_file: + raw_json = upload_doc(data_file) + st.sidebar.write(str(data_file.name) + "    ✅ ") + if debug: + st.subheader("REST API JSON response") + st.sidebar.write(raw_json) - # load csv into pandas dataframe + st.sidebar.markdown(f""" + + + """, unsafe_allow_html=True) + + # Load csv into pandas dataframe if eval_mode: try: - df = pd.read_csv(eval_labels, sep=";") + df = pd.read_csv(EVAL_LABELS, sep=";") except Exception: - sys.exit("The eval file was not found. Please check the README for more information.") - if ( - state_question - and hasattr(state_question, "next_question") - and hasattr(state_question, "random_question") - and state_question.next_question - ): - random_question = state_question.random_question - random_answer = state_question.random_answer - else: - random_question, random_answer = random_questions(df) - state_question.random_question = random_question - state_question.random_answer = random_answer + st.error(f"The eval file was not found. Please check the demo's [README](https://github.com/deepset-ai/haystack/tree/master/ui/README.md) for more information.") + sys.exit(f"The eval file was not found under `{EVAL_LABELS}`. Please check the README (https://github.com/deepset-ai/haystack/tree/master/ui/README.md) for more information.") - # Get next random question from the CSV - if eval_mode: - next_question = st.button("Load new question") - if next_question: - random_question, random_answer = random_questions(df) - state_question.random_question = random_question - state_question.random_answer = random_answer - state_question.next_question = True - state_question.run_query = False - else: - state_question.next_question = False + # Get next random question from the CSV + state.get_next_question = st.button("Load new question") + if state.get_next_question: + reset_results() + new_row = df.sample(1) + while new_row["Question Text"].values[0] == state.random_question: # Avoid picking the same question twice (the change is not visible on the UI) + new_row = df.sample(1) + state.random_question = new_row["Question Text"].values[0] + state.random_answer = new_row["Answer"].values[0] # Search bar - question = st.text_input("Please provide your query:", value=random_question) - if state_question and state_question.run_query: - run_query = state_question.run_query - st.button("Run") - else: - run_query = st.button("Run") - state_question.run_query = run_query - - raw_json_feedback = "" + question = st.text_input( + "Please provide your query:", + value=state.random_question, + max_chars=100, + on_change=reset_results + ) + run_query = st.button("Run") + # Check the connection with st.spinner("⌛️    Haystack is starting..."): if not haystack_is_ready(): st.error("🚫    Connection Error. 
Is Haystack running?") run_query = False + reset_results() # Get results for query - if run_query: + if run_query and question: + reset_results() with st.spinner( "🧠    Performing neural search on documents... \n " "Do you want to optimize speed or accuracy? \n" "Check out the docs: https://haystack.deepset.ai/usage/optimization " ): try: - results, raw_json = retrieve_doc(question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever) + state.results, state.raw_json = retrieve_doc(question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever) except Exception as e: logging.exception(e) - st.error("🐞    An error occurred during the request. Check the logs in the console to know more.") + if "The server is busy processing requests" in str(e): + st.error("🧑‍🌾    All our workers are busy! Try again later.") + else: + st.error("🐞    An error occurred during the request. Check the logs in the console to know more.") return - # Show if we use a question of the given set - if question == random_question and eval_mode: + if state.results: + + # Show the gold answer if we use a question of the given set + if question == state.random_question and eval_mode: st.write("## Correct answers:") - random_answer + st.write(state.random_answer) st.write("## Results:") + count = 0 # Make every button key unique - # Make every button key unique - count = 0 - - for result in results: + for result in state.results: if result["answer"]: - annotate_answer(result["answer"], result["context"]) + answer, context = result["answer"], result["context"] + start_idx = context.find(answer) + end_idx = start_idx + len(answer) + annotated_text(context[:start_idx], (answer, "ANSWER", "#8ef"), context[end_idx:]) else: - show_plain_documents(result["context"]) + st.markdown(result["context"]) + st.write("**Relevance:** ", result["relevance"], "**Source:** ", result["source"]) if eval_mode: # Define columns for buttons - button_col1, button_col2, button_col3, button_col4 = st.columns([1, 1, 1, 6]) - if button_col1.button("👍", key=(result["context"] + str(count) + "1"), help="Correct answer"): - raw_json_feedback = feedback_doc( - question, "true", result["document_id"], 1, "true", result["answer"], result["offset_start_in_doc"] + button_col1, button_col2, button_col3, _ = st.columns([1, 1, 1, 6]) + if button_col1.button("👍", key=f"{result['context']}{count}1", help="Correct answer"): + feedback_doc( + question=question, + is_correct_answer="true", + document_id=result["document_id"], + model_id=1, + is_correct_document="true", + answer=result["answer"], + offset_start_in_doc=result["offset_start_in_doc"] ) - st.success("Thanks for your feedback") - if button_col2.button("👎", key=(result["context"] + str(count) + "2"), help="Wrong answer and wrong passage"): - raw_json_feedback = feedback_doc( - question, - "false", - result["document_id"], - 1, - "false", - result["answer"], - result["offset_start_in_doc"], + st.success("✨    Thanks for your feedback!    
✨") + + if button_col2.button("👎", key=f"{result['context']}{count}2", help="Wrong answer and wrong passage"): + feedback_doc( + question=question, + is_correct_answer="false", + document_id=result["document_id"], + model_id=1, + is_correct_document="false", + answer=result["answer"], + offset_start_in_doc=result["offset_start_in_doc"] ) - st.success("Thanks for your feedback!") - if button_col3.button("👎👍", key=(result["context"] + str(count) + "3"), help="Wrong answer, but correct passage"): - raw_json_feedback = feedback_doc( - question, "false", result["document_id"], 1, "true", result["answer"], result["offset_start_in_doc"] + st.success("✨    Thanks for your feedback!    ✨") + + if button_col3.button("👎👍", key=f"{result['context']}{count}3", help="Wrong answer, but correct passage"): + feedback_doc( + question=question, + is_correct_answer="false", + document_id=result["document_id"], + model_id=1, + is_correct_document="true", + answer=result["answer"], + offset_start_in_doc=result["offset_start_in_doc"] ) - st.success("Thanks for your feedback!") + st.success("✨    Thanks for your feedback!    ✨") count += 1 st.write("___") + if debug: st.subheader("REST API JSON response") - st.write(raw_json) + st.write(state.raw_json) + main()