import json

import pandas as pd

from haystack.utils import launch_es, fetch_archive_from_http, print_answers
from haystack.document_stores import ElasticsearchDocumentStore
from haystack import Document, Pipeline
from haystack.nodes.retriever import TableTextRetriever
from haystack.nodes import TableReader


def tutorial15_tableqa():

    # Recommended: Start Elasticsearch using Docker via the Haystack utility function
    launch_es()

    ## Connect to Elasticsearch
    # We want to use a small model producing 512-dimensional embeddings, so we need to set embedding_dim to 512
    document_store = ElasticsearchDocumentStore(
        host="localhost", username="", password="", index="document", embedding_dim=512
    )

    ## Add Tables to DocumentStore
    # Let's first fetch some tables that we want to query
    # Here: 1000 tables from OTT-QA
    doc_dir = "data"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_tables_sample.json.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Add the tables to the DocumentStore
    def read_ottqa_tables(filename):
        processed_tables = []
        with open(filename) as tables:
            tables = json.load(tables)
            for key, table in tables.items():
                current_columns = table["header"]
                current_rows = table["data"]
                current_df = pd.DataFrame(columns=current_columns, data=current_rows)
                current_doc_title = table["title"]
                current_section_title = table["section_title"]
                document = Document(
                    content=current_df,
                    content_type="table",
                    meta={"title": current_doc_title, "section_title": current_section_title},
                    id=key,
                )
                processed_tables.append(document)

        return processed_tables

    tables = read_ottqa_tables("data/ottqa_tables_sample.json")
    document_store.write_documents(tables, index="document")

    ### Retriever
    # Retrievers help to narrow down the scope for the Reader to a subset of tables where a given question could be answered.
    # They use simple but fast algorithms.
    #
    # **Here:** We use the TableTextRetriever, which is capable of retrieving relevant content from a database
    # of texts and tables using dense embeddings.
    retriever = TableTextRetriever(
        document_store=document_store,
        query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder",
        passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder",
        table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder",
        embed_meta_fields=["title", "section_title"],
    )

    # Add table embeddings to the tables in DocumentStore
    document_store.update_embeddings(retriever=retriever)

    ## Alternative: ElasticsearchRetriever
    # from haystack.nodes.retriever import ElasticsearchRetriever
    # retriever = ElasticsearchRetriever(document_store=document_store)

    # Try the Retriever
    from haystack.utils import print_documents

    retrieved_tables = retriever.retrieve("How many twin buildings are under construction?", top_k=5)
    # Get highest scored table
    print(retrieved_tables[0].content)

    ### Reader
    # The TableReader is based on TaPas, a transformer-based language model capable of grasping the two-dimensional structure of a table.
    # It scans the tables returned by the retriever and extracts the answer.
    # The available TableReader models can be found [here](https://huggingface.co/models?pipeline_tag=table-question-answering&sort=downloads).
    #
    # **Notice**: The TableReader will return an answer for each table, even if the query cannot be answered by the table.
    # Furthermore, the confidence scores are not useful as of now, given that they will *always* be very high (i.e. 1 or close to 1).
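    # Optional alternative (an illustrative sketch, not part of the original tutorial): instead of the
    # WTQ-finetuned TaPas model used below, any other table-question-answering checkpoint from the
    # model hub linked above could be plugged in the same way, e.g. a WikiSQL-finetuned variant.
    # The checkpoint name here is an assumption; verify it on the model hub before using it.
    # reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wikisql-supervised", max_seq_len=512)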
    reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_seq_len=512)

    # Try the TableReader on one Table (highest-scored retrieved table)
    table_doc = document_store.get_document_by_id("List_of_tallest_twin_buildings_and_structures_in_the_world_1")
    print(table_doc.content)

    prediction = reader.predict(query="How many twin buildings are under construction?", documents=[table_doc])
    print_answers(prediction, details="minimal")

    ### Pipeline
    # The Retriever and the Reader can be combined into a pipeline in order to first retrieve relevant tables
    # and then extract the answer.
    #
    # **Notice**: Given that the `TableReader` does not provide useful confidence scores and returns an answer
    # for each of the tables, the sorting of the answers might not be helpful.
    table_qa_pipeline = Pipeline()
    table_qa_pipeline.add_node(component=retriever, name="TableTextRetriever", inputs=["Query"])
    table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["TableTextRetriever"])

    prediction = table_qa_pipeline.run("How many twin buildings are under construction?")
    print_answers(prediction, details="minimal")


if __name__ == "__main__":
    tutorial15_tableqa()

# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/
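
# Optional follow-up (an illustrative sketch, not part of the original tutorial): instead of
# print_answers, the Answer objects in the `prediction` dict returned by table_qa_pipeline.run()
# inside tutorial15_tableqa() can be inspected directly. The attribute and meta-field names below
# are assumptions about Haystack's Answer API and should be checked against the installed version.
#
# top_answer = prediction["answers"][0]
# print(f"Predicted answer: {top_answer.answer}")
# print(f"Answer meta (e.g. aggregation operator and answer cells): {top_answer.meta}")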