2021-04-22 17:30:17 +02:00
import os
import sys
2021-04-30 14:16:30 +05:30
2021-11-22 19:06:08 +01:00
import html
2021-09-27 16:40:25 +02:00
import logging
2021-04-30 14:16:30 +05:30
import pandas as pd
2021-11-22 19:06:08 +01:00
from json import JSONDecodeError
2021-11-19 11:34:32 +01:00
from pathlib import Path
2020-12-27 18:06:09 +05:30
import streamlit as st
2021-11-22 19:06:08 +01:00
from annotated_text import annotation
from markdown import markdown
from htbuilder import H
2021-04-30 14:16:30 +05:30
# Streamlit does not support any state out of the box. On every button click, Streamlit reloads the whole page
# and every value gets lost. To keep track of our feedback state we use the official Streamlit gist mentioned
# here: https://gist.github.com/tvst/036da038ab3e999a64497f42de966a92
import SessionState
2021-11-19 11:34:32 +01:00
from utils import HS_VERSION , feedback_doc , haystack_is_ready , retrieve_doc , upload_doc , haystack_version
2021-06-24 09:53:08 +02:00
2021-04-07 17:53:32 +02:00
2021-11-19 11:34:32 +01:00
# Question pre-filled in the search bar when the UI is first loaded; adjust it to
# whatever you would like users to see.
DEFAULT_QUESTION_AT_STARTUP = "Who's the father of Arya Stark?"

# CSV file holding the labels used in evaluation mode. The EVAL_FILE environment
# variable overrides the example file that sits next to this script.
EVAL_LABELS = os.getenv("EVAL_FILE", Path(__file__).parent / "eval_labels_example.csv")

# Whether the file upload should be disabled. The value is only tested for
# truthiness, so ANY non-empty string (even "0" or "false") hides the upload widget.
DISABLE_FILE_UPLOAD = os.getenv("HAYSTACK_UI_DISABLE_FILE_UPLOAD")
2021-06-24 09:53:08 +02:00
2021-04-07 17:53:32 +02:00
2021-09-27 16:40:25 +02:00
# One entry per feedback button:
# (label, key suffix, tooltip, is_correct_answer, is_correct_document)
_FEEDBACK_BUTTONS = (
    ("👍", "1", "Correct answer", "true", "true"),
    ("👎", "2", "Wrong answer and wrong passage", "false", "false"),
    ("👎👍", "3", "Wrong answer, but correct passage", "false", "true"),
)

# Sidebar footer. Rendered through str.format, hence the doubled CSS braces.
_FOOTER_TEMPLATE = """
<style>
a {{
text-decoration: none;
}}
.haystack-footer {{
text-align: center;
}}
.haystack-footer h4 {{
margin: 0.1rem;
padding: 0;
}}
footer {{
opacity: 0;
}}
</style>
<div class="haystack-footer">
<hr />
<h4>Built with <a href="https://www.deepset.ai/haystack">Haystack</a>{hs_version}</h4>
<p>Get it on <a href="https://github.com/deepset-ai/haystack/">GitHub</a> &nbsp;&nbsp; - &nbsp;&nbsp; Read the <a href="https://haystack.deepset.ai/overview/intro">Docs</a></p>
<small>Data crawled from <a href="https://en.wikipedia.org/wiki/Category:Lists_of_countries_by_continent">Wikipedia</a> in November 2021.<br />See the <a href="https://creativecommons.org/licenses/by-sa/3.0/">License</a> (CC BY-SA 3.0).</small>
</div>
"""


def _pick_new_question(df, current_question):
    """Return a one-row sample of ``df`` whose question differs from ``current_question``.

    Falls back to sampling from the whole frame when every row carries the
    current question (the previous rejection loop never terminated in that
    case). Returns ``None`` when ``df`` has no rows at all.
    """
    if df.empty:
        return None
    candidates = df[df["Question Text"] != current_question]
    if candidates.empty:
        candidates = df
    return candidates.sample(1)


def _highlight_answer(context: str, answer: str) -> str:
    """Return ``context`` rendered to HTML with ``answer`` shown as an annotation.

    The annotation is converted with ``str()`` and embedded by hand because of
    https://github.com/streamlit/streamlit/issues/3190.
    """
    tag = str(annotation(answer, "ANSWER", "#8ef"))
    start_idx = context.find(answer)
    if start_idx == -1:
        # The answer text does not occur verbatim in the context: slicing with
        # -1 would garble the snippet, so show the tag in front of it instead.
        return markdown(f"{tag} {context}")
    end_idx = start_idx + len(answer)
    return markdown(context[:start_idx] + tag + context[end_idx:])


def _render_feedback_buttons(question: str, result: dict, index: int) -> None:
    """Draw the three feedback buttons for ``result`` and send feedback on click.

    ``index`` is the position of the result in the list; it is part of the
    widget key so that every button key is unique.
    """
    # Three narrow columns for the buttons; the wide fourth one is only a spacer.
    *button_columns, _ = st.columns([1, 1, 1, 6])
    for column, (label, suffix, tooltip, answer_ok, document_ok) in zip(button_columns, _FEEDBACK_BUTTONS):
        if column.button(label, key=f"{result['context']}{index}{suffix}", help=tooltip):
            feedback_doc(
                question=question,
                is_correct_answer=answer_ok,
                document_id=result.get("document_id", None),
                model_id=1,
                is_correct_document=document_ok,
                answer=result["answer"],
                offset_start_in_doc=result.get("offset_start_in_doc", None),
            )
            st.success("✨ Thanks for your feedback! ✨")


def main():
    """Build the Haystack demo page.

    Draws the sidebar (options, optional file upload, footer), the search bar
    and, once a query has been run, the answers together with the optional
    evaluation buttons and debug output. Results are kept in the session state
    so that they survive the re-run triggered by every widget interaction.
    """
    # Persistent state
    state = SessionState.get(
        random_question=DEFAULT_QUESTION_AT_STARTUP,
        random_answer="",
        results=None,
        raw_json=None,
        get_next_question=True,
    )

    # Small callback to reset the interface in case the text of the question changes
    def reset_results(*args):
        state.results = None
        state.raw_json = None

    # Title
    st.write("# Haystack Demo")

    # Sidebar
    st.sidebar.header("Options")
    top_k_reader = st.sidebar.slider("Max. number of answers", min_value=1, max_value=10, value=3, step=1)
    top_k_retriever = st.sidebar.slider(
        "Max. number of documents from retriever", min_value=1, max_value=10, value=3, step=1
    )
    eval_mode = st.sidebar.checkbox("Evaluation mode")
    debug = st.sidebar.checkbox("Show debug info")

    # File upload block
    if not DISABLE_FILE_UPLOAD:
        st.sidebar.write("## File Upload:")
        data_files = st.sidebar.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True)
        for data_file in data_files or []:
            # Upload file
            if data_file:
                raw_json = upload_doc(data_file)
                st.sidebar.write(str(data_file.name) + " ✅ ")
                if debug:
                    # Keep the header next to its payload, i.e. in the sidebar.
                    st.sidebar.subheader("REST API JSON response")
                    st.sidebar.write(raw_json)

    # Footer. The version suffix stays empty when it cannot be fetched, so the
    # footer never shows a literal "None".
    hs_version = ""
    try:
        hs_version = f" <small>(v{haystack_version()})</small>"
    except Exception:
        # Best effort only: the page is still usable without the version.
        logging.debug("Could not retrieve the Haystack version", exc_info=True)
    st.sidebar.markdown(_FOOTER_TEMPLATE.format(hs_version=hs_version), unsafe_allow_html=True)

    # Load csv into pandas dataframe
    if eval_mode:
        try:
            df = pd.read_csv(EVAL_LABELS, sep=";")
        except Exception:
            st.error(
                "The eval file was not found. Please check the demo's "
                "[README](https://github.com/deepset-ai/haystack/tree/master/ui/README.md) for more information."
            )
            sys.exit(
                f"The eval file was not found under `{EVAL_LABELS}`. Please check the README "
                "(https://github.com/deepset-ai/haystack/tree/master/ui/README.md) for more information."
            )

        # Get next random question from the CSV
        state.get_next_question = st.button("Load new question")
        if state.get_next_question:
            reset_results()
            # Avoid picking the same question twice (the change is not visible on the UI)
            new_row = _pick_new_question(df, state.random_question)
            if new_row is None:
                st.warning("The eval file contains no questions.")
            else:
                state.random_question = new_row["Question Text"].values[0]
                state.random_answer = new_row["Answer"].values[0]

    # Search bar
    question = st.text_input(
        "Please provide your query:",
        value=state.random_question,
        max_chars=100,
        on_change=reset_results,
    )
    run_query = st.button("Run")

    # Check the connection
    with st.spinner("⌛️ Haystack is starting..."):
        if not haystack_is_ready():
            st.error("🚫 Connection Error. Is Haystack running?")
            run_query = False
            reset_results()

    # Get results for query
    if run_query and question:
        reset_results()
        with st.spinner(
            "🧠 Performing neural search on documents... \n "
            "Do you want to optimize speed or accuracy? \n"
            "Check out the docs: https://haystack.deepset.ai/usage/optimization "
        ):
            try:
                state.results, state.raw_json = retrieve_doc(
                    question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
                )
            except JSONDecodeError:
                st.error("👓 An error occurred reading the results. Is the document store working?")
                return
            except Exception as e:
                logging.exception(e)
                if "The server is busy processing requests" in str(e):
                    st.error("🧑🌾 All our workers are busy! Try again later.")
                else:
                    st.error("🐞 An error occurred during the request. Check the logs in the console to know more.")
                return

    if state.results:
        # Show the gold answer if we use a question of the given set
        if question == state.random_question and eval_mode:
            st.write("## Correct answers:")
            st.write(state.random_answer)

        st.write("## Results:")

        # The index makes every button key unique
        for index, result in enumerate(state.results):
            if result["answer"]:
                st.write(_highlight_answer(result["context"], result["answer"]), unsafe_allow_html=True)
                st.write("**Relevance:**", result["relevance"], "**Source:**", result["source"])
            else:
                st.warning("🤔 Haystack found no good answer to your question. Try to formulate it differently!")
                st.write("**Relevance:**", result["relevance"])

            if eval_mode:
                _render_feedback_buttons(question, result, index)
            st.write("___")

        if debug:
            st.subheader("REST API JSON response")
            st.write(state.raw_json)
2021-09-27 16:40:25 +02:00
# Build the page: this call runs at module level every time the script is executed.
main()