import json
import os
import time

import pandas as pd

from haystack import Answer, Document, Label, MultiLabel, Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import FARMReader, JoinAnswers, ParsrConverter, RouteDocuments, TableReader
from haystack.nodes.retriever import EmbeddingRetriever
from haystack.utils import fetch_archive_from_http, launch_es, print_answers
def tutorial15_tableqa():
    """Demonstrate question answering on tables (and mixed table/text corpora) with Haystack.

    The tutorial walks through:
      1. starting Elasticsearch and connecting an ``ElasticsearchDocumentStore``,
      2. indexing a dataset of tables as ``Document`` objects whose content is a DataFrame,
      3. dense retrieval with ``EmbeddingRetriever`` and extraction with ``TableReader``,
      4. a combined text+table pipeline via ``RouteDocuments`` and ``JoinAnswers``,
      5. evaluating that pipeline against gold labels, and
      6. converting tables from a PDF with ``ParsrConverter``.

    NOTE(review): this is a tutorial script with heavy side effects (Docker, network
    downloads, model downloads) — run it top to bottom, do not import it.
    """
    # Recommended: Start Elasticsearch using Docker via the Haystack utility function
    launch_es()

    ## Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

    ## Add Tables to DocumentStore
    # Let's first fetch some tables that we want to query
    # Here: 1000 tables + texts
    doc_dir = "data/tutorial15"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Add the tables to the DocumentStore
    def read_tables(filename):
        """Load tables from a JSON file and wrap each one in a Haystack Document.

        Each entry maps an id to ``{"header": [...], "data": [[...], ...]}``; the
        table itself is stored as a pandas DataFrame in the Document's content.
        """
        processed_tables = []
        with open(filename) as tables:
            tables = json.load(tables)
            for key, table in tables.items():
                current_columns = table["header"]
                current_rows = table["data"]
                current_df = pd.DataFrame(columns=current_columns, data=current_rows)
                document = Document(content=current_df, content_type="table", id=key)
                processed_tables.append(document)
        return processed_tables

    tables = read_tables(f"{doc_dir}/tables.json")
    document_store.write_documents(tables, index="document")

    ### Retriever
    # Retrievers help narrowing down the scope for the Reader to a subset of tables
    # where a given question could be answered. They use some simple but fast algorithm.
    #
    # **Here:** We use the EmbeddingRetriever capable of retrieving relevant content
    # among a database of texts and tables using dense embeddings.
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="deepset/all-mpnet-base-v2-table",
        model_format="sentence_transformers",
    )

    # Add table embeddings to the tables in DocumentStore
    document_store.update_embeddings(retriever=retriever)

    ## Alternative: BM25Retriever
    # from haystack.nodes.retriever import BM25Retriever
    # retriever = BM25Retriever(document_store=document_store)

    # Try the Retriever
    from haystack.utils import print_documents

    retrieved_tables = retriever.retrieve("Who won the Super Bowl?", top_k=5)
    # Get highest scored table
    print(retrieved_tables[0].content)

    ### Reader
    # The TableReader is based on TaPas, a transformer-based language model capable of
    # grasping the two-dimensional structure of a table. It scans the tables returned by
    # the retriever and extracts the answer. The available TableReader models can be found
    # [here](https://huggingface.co/models?pipeline_tag=table-question-answering&sort=downloads).
    #
    # **Notice**: The TableReader will return an answer for each table, even if the query
    # cannot be answered by the table. Furthermore, the confidence scores are not useful
    # as of now, given that they will *always* be very high (i.e. 1 or close to 1).
    reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_seq_len=512)

    # Try the TableReader on one Table
    table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
    print(table_doc.content)

    prediction = reader.predict(query="Who played Gregory House in the series House?", documents=[table_doc])
    print_answers(prediction, details="minimum")

    ### Pipeline
    # The Retriever and the Reader can be sticked together to a pipeline in order to first
    # retrieve relevant tables and then extract the answer.
    #
    # **Notice**: Given that the `TableReader` does not provide useful confidence scores and
    # returns an answer for each of the tables, the sorting of the answers might be not helpful.
    table_qa_pipeline = Pipeline()
    table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
    table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["EmbeddingRetriever"])

    prediction = table_qa_pipeline.run("When was Guilty Gear Xrd : Sign released?")
    print_answers(prediction, details="minimum")

    ### Pipeline for QA on Combination of Text and Tables
    # We are using one node for retrieving both texts and tables, the EmbeddingRetriever.
    # In order to do question-answering on the Documents coming from the EmbeddingRetriever,
    # we need to route Documents of type "text" to a FARMReader (or alternatively
    # TransformersReader) and Documents of type "table" to a TableReader.
    text_reader = FARMReader("deepset/roberta-base-squad2")
    # In order to get meaningful scores from the TableReader, use "deepset/tapas-large-nq-hn-reader" or
    # "deepset/tapas-large-nq-reader" as TableReader models. The disadvantage of these models is, however,
    # that they are not capable of doing aggregations over multiple table cells.
    table_reader = TableReader("deepset/tapas-large-nq-hn-reader")
    route_documents = RouteDocuments()
    join_answers = JoinAnswers()

    text_table_qa_pipeline = Pipeline()
    text_table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
    text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["EmbeddingRetriever"])
    text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"])
    text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"])
    text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"])

    # Add texts to the document store
    def read_texts(filename):
        """Load text passages from a JSON file (id -> passage text) as Haystack Documents."""
        processed_passages = []
        with open(filename) as passages:
            passages = json.load(passages)
            for key, content in passages.items():
                document = Document(content=content, content_type="text", id=key)
                processed_passages.append(document)
        return processed_passages

    passages = read_texts(f"{doc_dir}/texts.json")
    document_store.write_documents(passages)

    # Example query whose answer resides in a text passage
    predictions = text_table_qa_pipeline.run(query="Which country does the film Macaroni come from?")
    # We can see both text passages and tables as contexts of the predicted answers.
    print_answers(predictions, details="minimum")

    # Example query whose answer resides in a table
    predictions = text_table_qa_pipeline.run(query="Who was Thomas Alva Edison?")
    # We can see both text passages and tables as contexts of the predicted answers.
    print_answers(predictions, details="minimum")

    ### Evaluation
    # To evaluate our pipeline, we can use haystack's evaluation feature. We just need to
    # convert our labels into `MultiLabel` objects and the `eval` method will do the rest.
    def read_labels(filename, tables):
        """Build single-label ``MultiLabel`` gold labels for the given documents.

        Documents that have no entry in the label file are skipped, so the same label
        file can be reused for both the table and the passage document sets.
        """
        processed_labels = []
        with open(filename) as labels:
            labels = json.load(labels)
            for table in tables:
                # Skip documents without a gold label
                if table.id not in labels:
                    continue
                label = labels[table.id]
                label = Label(
                    query=label["query"],
                    document=table,
                    is_correct_answer=True,
                    is_correct_document=True,
                    answer=Answer(answer=label["answer"]),
                    origin="gold-label",
                )
                processed_labels.append(MultiLabel(labels=[label]))
        return processed_labels

    table_labels = read_labels(f"{doc_dir}/labels.json", tables)
    passage_labels = read_labels(f"{doc_dir}/labels.json", passages)

    eval_results = text_table_qa_pipeline.eval(table_labels + passage_labels, params={"top_k": 10})
    # Calculating and printing the evaluation metrics
    print(eval_results.calculate_metrics())

    ## Adding tables from PDFs
    # It can sometimes be hard to provide your data in form of a pandas DataFrame.
    # For this case, we provide the `ParsrConverter` wrapper that can help you to convert,
    # for example, a PDF file into a document that you can index.
    os.system("docker run -d -p 3001:3001 axarev/parsr")
    # Give the Parsr container time to come up before sending it requests
    time.sleep(30)
    os.system("wget https://www.w3.org/WAI/WCAG21/working-examples/pdf-table/table.pdf")
    converter = ParsrConverter()
    docs = converter.convert("table.pdf")
    tables = [doc for doc in docs if doc.content_type == "table"]
    print(tables)
2021-10-29 11:07:13 +02:00
# Script entry point. The comparison string must be exactly "__main__" (the garbled
# space-padded variant would never match, so the tutorial would silently never run).
if __name__ == "__main__":
    tutorial15_tableqa()

# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/