Add evaluation and document conversion to tutorial 15 (#2325)
* update tutorial 15 with newer features
* Update Documentation & Code Style
* fix tutorial 15
* update telemetry with tutorial changes
* Update Documentation & Code Style
* remove error output
* add output
* update non-notebook tutorial 15
* Update Documentation & Code Style
* delete distracting output from tutorial 15 notebook
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in: parent 834f8c4902 · commit eb514a6167
@@ -84,7 +84,7 @@ document_store = ElasticsearchDocumentStore(
```

## Add Tables to DocumentStore

-To quickly demonstrate the capabilities of the `TableTextRetriever` and the `TableReader`, we use a subset of 1000 tables from the [Open Table-and-Text Question Answering (OTT-QA) dataset](https://github.com/wenhuchen/OTT-QA).
+To quickly demonstrate the capabilities of the `TableTextRetriever` and the `TableReader`, we use a subset of 1000 tables and text documents from a dataset we have published in [this paper](https://arxiv.org/abs/2108.04049).

Just like text passages, tables are represented as `Document` objects in Haystack. The content field, though, is a pandas DataFrame instead of a string.
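As a quick illustration of the table `Document`s this prose describes (a minimal sketch, not part of the diff; the DataFrame values and the id are invented):

```python
# Minimal sketch of a table Document: content is a pandas DataFrame, not a string.
import pandas as pd
from haystack import Document

df = pd.DataFrame(columns=["Building", "Height"], data=[["Petronas Towers", "452 m"]])
table_doc = Document(content=df, content_type="table", id="example-table")  # id invented
print(type(table_doc.content))  # <class 'pandas.core.frame.DataFrame'>
```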
@@ -95,7 +95,7 @@ Just like text passages, tables are represented as `Document` objects in Haystack.
from haystack.utils import fetch_archive_from_http

doc_dir = "data/tutorial15"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
```

@@ -108,7 +108,7 @@ from haystack import Document
import pandas as pd


-def read_ottqa_tables(filename):
+def read_tables(filename):
    processed_tables = []
    with open(filename) as tables:
        tables = json.load(tables)
@@ -116,20 +116,13 @@ def read_ottqa_tables(filename):
            current_columns = table["header"]
            current_rows = table["data"]
            current_df = pd.DataFrame(columns=current_columns, data=current_rows)
-            current_doc_title = table["title"]
-            current_section_title = table["section_title"]
-            document = Document(
-                content=current_df,
-                content_type="table",
-                meta={"title": current_doc_title, "section_title": current_section_title},
-                id=key,
-            )
+            document = Document(content=current_df, content_type="table", id=key)
            processed_tables.append(document)

    return processed_tables


-tables = read_ottqa_tables(f"{doc_dir}/ottqa_tables_sample.json")
+tables = read_tables(f"{doc_dir}/tables.json")
document_store.write_documents(tables, index=document_index)

# Showing content field and meta field of one of the Documents of content_type 'table'
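For orientation, the structure of `tables.json` implied by the new `read_tables` above looks like this (a sketch; keys and values invented):

```python
# Each top-level key is a document id; "header" and "data" feed the DataFrame.
tables_json = {
    "some-table-id": {
        "header": ["Player", "Grand Slam titles"],
        "data": [["Roger Federer", "20"], ["Rafael Nadal", "22"]],
    }
}
```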
@@ -161,7 +154,6 @@ retriever = TableTextRetriever(
    query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder",
    passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder",
    table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder",
-    embed_meta_fields=["title", "section_title"],
)
```

@@ -183,7 +175,7 @@ document_store.update_embeddings(retriever=retriever)
# Try the Retriever
from haystack.utils import print_documents

-retrieved_tables = retriever.retrieve("How many twin buildings are under construction?", top_k=5)
+retrieved_tables = retriever.retrieve("Who won the Super Bowl?", top_k=5)
# Get highest scored table
print(retrieved_tables[0].content)
```
@@ -202,9 +194,9 @@ reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_s


```python
-# Try the TableReader on one Table (highest-scored retrieved table from previous section)
+# Try the TableReader on one Table

-table_doc = document_store.get_document_by_id("List_of_tallest_twin_buildings_and_structures_in_the_world_1")
+table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
print(table_doc.content)
```
@@ -212,14 +204,12 @@ print(table_doc.content)

```python
from haystack.utils import print_answers

-prediction = reader.predict(query="How many twin buildings are under construction?", documents=[table_doc])
+prediction = reader.predict(query="Who played Gregory House in the series House?", documents=[table_doc])
print_answers(prediction, details="all")
```

The offsets in the `offsets_in_document` and `offsets_in_context` fields indicate the table cells that the model predicts to be part of the answer. They need to be interpreted on the linearized table, i.e., a flat list containing all of the table cells.

In the `Answer`'s meta field, you can find the aggregation operator used to construct the answer (in this case `COUNT`) and the answer cells as strings.
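As a sketch of what "linearized" means here (assuming row-major flattening of the cells, which is how TAPAS-style readers typically index a table; verify against your Haystack version):

```python
# Map a linearized cell offset back to a (row, column) position.
import pandas as pd

def offset_to_cell(offset: int, df: pd.DataFrame) -> tuple:
    n_cols = df.shape[1]
    return divmod(offset, n_cols)

df = pd.DataFrame(columns=["A", "B", "C"], data=[[1, 2, 3], [4, 5, 6]])
print(offset_to_cell(4, df))  # (1, 1) -> the cell holding 5
```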
```python
print(f"Predicted answer: {prediction['answers'][0].answer}")
```
@@ -243,34 +233,27 @@ table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["TableT


```python
-prediction = table_qa_pipeline.run("How many twin buildings are under construction?")
+prediction = table_qa_pipeline.run("When was Guilty Gear Xrd : Sign released?", params={"top_k": 30})
print_answers(prediction, details="minimum")
```
# Open-Domain QA on Text and Tables
With Haystack, you are not limited to QA on either texts or tables alone: you can use both texts and tables together as your source of information.

To demonstrate this, we add 1,000 sample text passages from the OTT-QA dataset.


```python
-# Add 1,000 text passages from OTT-QA to our document store.
+# Add 500 text passages to our document store.


-def read_ottqa_texts(filename):
+def read_texts(filename):
    processed_passages = []
    with open(filename) as passages:
        passages = json.load(passages)
-        for title, content in passages.items():
-            title = title[6:]
-            title = title.replace("_", " ")
-            document = Document(content=content, content_type="text", meta={"title": title})
+        for key, content in passages.items():
+            document = Document(content=content, content_type="text", id=key)
            processed_passages.append(document)

    return processed_passages


-passages = read_ottqa_texts(f"{doc_dir}/ottqa_texts_sample.json")
+passages = read_texts(f"{doc_dir}/texts.json")
document_store.write_documents(passages, index=document_index)
```
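The combined text-and-table pipeline is assembled further down in the script portion of this diff. As a sketch of the wiring (node names and `RouteDocuments` outputs taken from that script; `document_store` from the top of the tutorial; the text reader model is an illustrative assumption):

```python
# Route retrieved documents by content_type, read each type with the matching
# reader, then merge the answers from both branches.
from haystack import Pipeline
from haystack.nodes import FARMReader, TableReader, RouteDocuments, JoinAnswers
from haystack.nodes.retriever import TableTextRetriever

retriever = TableTextRetriever(
    document_store=document_store,
    query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder",
    passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder",
    table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder",
)
text_reader = FARMReader("deepset/roberta-base-squad2")  # assumption: any extractive text QA model
table_reader = TableReader("google/tapas-base-finetuned-wtq")

text_table_qa_pipeline = Pipeline()
text_table_qa_pipeline.add_node(component=retriever, name="TableTextRetriever", inputs=["Query"])
text_table_qa_pipeline.add_node(component=RouteDocuments(), name="RouteDocuments", inputs=["TableTextRetriever"])
text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"])
text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"])
text_table_qa_pipeline.add_node(component=JoinAnswers(), name="JoinAnswers", inputs=["TextReader", "TableReader"])
```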
@@ -321,7 +304,7 @@ display.Image("pipeline.png")


```python
# Example query whose answer resides in a text passage
-predictions = text_table_qa_pipeline.run(query="Who is Aleksandar Trifunovic?")
+predictions = text_table_qa_pipeline.run(query="Who was Thomas Alva Edison?")
```

@@ -333,7 +316,7 @@ print_answers(predictions, details="minimum")


```python
# Example query whose answer resides in a table
-predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?")
+predictions = text_table_qa_pipeline.run(query="Which country does the film Macaroni come from?")
```

@@ -342,6 +325,78 @@ predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?")
print_answers(predictions, details="minimum")
```

+## Evaluation
+To evaluate our pipeline, we can use Haystack's evaluation feature. We just need to convert our labels into `MultiLabel` objects, and the `eval` method will do the rest.
+
+```python
+from haystack import Label, MultiLabel, Answer
+
+
+def read_labels(filename, tables):
+    processed_labels = []
+    with open(filename) as labels:
+        labels = json.load(labels)
+        for table in tables:
+            if table.id not in labels:
+                continue
+            label = labels[table.id]
+            label = Label(
+                query=label["query"],
+                document=table,
+                is_correct_answer=True,
+                is_correct_document=True,
+                answer=Answer(answer=label["answer"]),
+                origin="gold-label",
+            )
+            processed_labels.append(MultiLabel(labels=[label]))
+    return processed_labels
+
+
+table_labels = read_labels(f"{doc_dir}/labels.json", tables)
+passage_labels = read_labels(f"{doc_dir}/labels.json", passages)
+```
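The `labels.json` format implied by `read_labels` is a mapping from document id to one query/answer pair, along these lines (the id and query are taken from this diff; the answer value is invented for illustration):

```python
# One gold label per document id.
labels_json = {
    "36964e90-3735-4ba1-8e6a-bec236e88bb2": {
        "query": "Who played Gregory House in the series House?",
        "answer": "Hugh Laurie",  # invented example value
    }
}
```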
+```python
+eval_results = text_table_qa_pipeline.eval(table_labels + passage_labels, params={"top_k": 10})
+```
+
+```python
+# Calculating and printing the evaluation metrics
+print(eval_results.calculate_metrics())
+```
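`calculate_metrics` groups the metrics by pipeline node, so individual nodes can be inspected as well (node names as added in this pipeline; the exact metric keys depend on your Haystack version):

```python
# Hedged sketch: per-node access to the computed metrics.
metrics = eval_results.calculate_metrics()
print(metrics["TableTextRetriever"])  # retrieval metrics, e.g. recall
print(metrics["TableReader"])         # reader metrics, e.g. exact_match, f1
```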
+## Adding tables from PDFs
+It can sometimes be hard to provide your data in the form of a pandas DataFrame. For this case, we provide the `ParsrConverter` wrapper, which can help you convert, for example, a PDF file into a document that you can index.
+
+```python
+!docker run -d -p 3001:3001 axarev/parsr
+```
+
+```python
+!wget https://www.w3.org/WAI/WCAG21/working-examples/pdf-table/table.pdf
+```
+
+```python
+from haystack.nodes import ParsrConverter
+
+converter = ParsrConverter()
+
+docs = converter.convert("table.pdf")
+
+tables = [doc for doc in docs if doc["content_type"] == "table"]
+```
+
+```python
+print(tables)
+```
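To actually index the converted tables, they still need to be wrapped as `Document` objects, roughly like this (a sketch; the dict fields are inferred from the `content_type` filter above):

```python
# Wrap ParsrConverter output dicts as table Documents and index them.
from haystack import Document

table_docs = [Document(content=t["content"], content_type="table") for t in tables]
document_store.write_documents(table_docs)
```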
## About us

This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
@@ -370,7 +370,7 @@ class TableReader(BaseReader):
        else:
            raise KeyError("unknown aggregator")

-        return f"{answer_value}{' ' + unit if unit else ''}"
+        return f"{answer_value}{' ' + unit if unit else ''}"

    except KeyError as e:
        if "unknown aggregator" in str(e):
@@ -225,7 +225,7 @@ class Document:

    def __str__(self):
        # In some cases, self.content is None (therefore not subscriptable)
-        if not self.content:
+        if self.content is None:
            return f"<Document: id={self.id}, content=None>"
        return f"<Document: id={self.id}, content='{self.content[:100]} {'...' if len(self.content) > 100 else ''}'>"
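Beyond the `None` case noted in the comment, the switch to `self.content is None` also matters for table `Document`s: `not self.content` on a DataFrame raises instead of returning False. A small illustration:

```python
# bool(DataFrame) is ambiguous in pandas, so `not df` raises; `is None` is safe.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
try:
    if not df:
        pass
except ValueError as err:
    print(err)  # "The truth value of a DataFrame is ambiguous. ..."
```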
@@ -202,7 +202,7 @@ def send_tutorial_event(url: str):
    "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip": "12",
    # Tutorial 13: no dataset available yet
    "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip": "14",
-    "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip": "15",
+    "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip": "15",
    # "https://nlp.stanford.edu/data/glove.6B.zip": "16",
    "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip": "16",
}
File diff suppressed because one or more lines are too long
@@ -1,12 +1,14 @@
+import os
import json

import pandas as pd

+from haystack import Label, MultiLabel, Answer
from haystack.utils import launch_es, fetch_archive_from_http, print_answers
from haystack.document_stores import ElasticsearchDocumentStore
from haystack import Document, Pipeline
from haystack.nodes.retriever import TableTextRetriever
-from haystack.nodes import TableReader, FARMReader, RouteDocuments, JoinAnswers
+from haystack.nodes import TableReader, FARMReader, RouteDocuments, JoinAnswers, ParsrConverter


def tutorial15_tableqa():
@@ -23,14 +25,14 @@ def tutorial15_tableqa():
    ## Add Tables to DocumentStore

    # Let's first fetch some tables that we want to query
-    # Here: 1000 tables from OTT-QA
+    # Here: 1000 tables + texts

    doc_dir = "data/tutorial15"
-    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip"
+    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Add the tables to the DocumentStore
-    def read_ottqa_tables(filename):
+    def read_tables(filename):
        processed_tables = []
        with open(filename) as tables:
            tables = json.load(tables)
@@ -38,19 +40,12 @@ def tutorial15_tableqa():
            current_columns = table["header"]
            current_rows = table["data"]
            current_df = pd.DataFrame(columns=current_columns, data=current_rows)
-            current_doc_title = table["title"]
-            current_section_title = table["section_title"]
-            document = Document(
-                content=current_df,
-                content_type="table",
-                meta={"title": current_doc_title, "section_title": current_section_title},
-                id=key,
-            )
+            document = Document(content=current_df, content_type="table", id=key)
            processed_tables.append(document)

        return processed_tables

-    tables = read_ottqa_tables(f"{doc_dir}/ottqa_tables_sample.json")
+    tables = read_tables(f"{doc_dir}/tables.json")
    document_store.write_documents(tables, index="document")

    ### Retriever
@@ -79,7 +74,7 @@ def tutorial15_tableqa():
    # Try the Retriever
    from haystack.utils import print_documents

-    retrieved_tables = retriever.retrieve("How many twin buildings are under construction?", top_k=5)
+    retrieved_tables = retriever.retrieve("Who won the Super Bowl?", top_k=5)
    # Get highest scored table
    print(retrieved_tables[0].content)

@@ -93,12 +88,12 @@ def tutorial15_tableqa():

    reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_seq_len=512)

-    # Try the TableReader on one Table (highest-scored retrieved table)
+    # Try the TableReader on one Table

-    table_doc = document_store.get_document_by_id("List_of_tallest_twin_buildings_and_structures_in_the_world_1")
+    table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
    print(table_doc.content)

-    prediction = reader.predict(query="How many twin buildings are under construction?", documents=[table_doc])
+    prediction = reader.predict(query="Who played Gregory House in the series House?", documents=[table_doc])
    print_answers(prediction, details="minimum")

    ### Pipeline
@@ -112,7 +107,7 @@ def tutorial15_tableqa():
    table_qa_pipeline.add_node(component=retriever, name="TableTextRetriever", inputs=["Query"])
    table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["TableTextRetriever"])

-    prediction = table_qa_pipeline.run("How many twin buildings are under construction?")
+    prediction = table_qa_pipeline.run("When was Guilty Gear Xrd : Sign released?")
    print_answers(prediction, details="minimum")

    ### Pipeline for QA on Combination of Text and Tables
@@ -136,16 +131,72 @@ def tutorial15_tableqa():
    text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"])
    text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"])

+    # Add texts to the document store
+    def read_texts(filename):
+        processed_passages = []
+        with open(filename) as passages:
+            passages = json.load(passages)
+            for key, content in passages.items():
+                document = Document(content=content, content_type="text", id=key)
+                processed_passages.append(document)
+
+        return processed_passages
+
+    passages = read_texts(f"{doc_dir}/texts.json")
+    document_store.write_documents(passages)
+
    # Example query whose answer resides in a text passage
-    predictions = text_table_qa_pipeline.run(query="Who is Aleksandar Trifunovic?")
+    predictions = text_table_qa_pipeline.run(query="Who was Thomas Alva Edison?")
    # We can see both text passages and tables as contexts of the predicted answers.
    print_answers(predictions, details="minimum")

    # Example query whose answer resides in a table
-    predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?")
+    predictions = text_table_qa_pipeline.run(query="Which country does the film Macaroni come from?")
    # We can see both text passages and tables as contexts of the predicted answers.
    print_answers(predictions, details="minimum")

+    ### Evaluation
+    # To evaluate our pipeline, we can use Haystack's evaluation feature.
+    # We just need to convert our labels into `MultiLabel` objects and the `eval` method will do the rest.
+
+    def read_labels(filename, tables):
+        processed_labels = []
+        with open(filename) as labels:
+            labels = json.load(labels)
+            for table in tables:
+                if table.id not in labels:
+                    continue
+                label = labels[table.id]
+                label = Label(
+                    query=label["query"],
+                    document=table,
+                    is_correct_answer=True,
+                    is_correct_document=True,
+                    answer=Answer(answer=label["answer"]),
+                    origin="gold-label",
+                )
+                processed_labels.append(MultiLabel(labels=[label]))
+        return processed_labels
+
+    table_labels = read_labels(f"{doc_dir}/labels.json", tables)
+    passage_labels = read_labels(f"{doc_dir}/labels.json", passages)
+
+    eval_results = text_table_qa_pipeline.eval(table_labels + passage_labels, params={"top_k": 10})
+
+    # Calculating and printing the evaluation metrics
+    print(eval_results.calculate_metrics())
+
+    ## Adding tables from PDFs
+    # It can sometimes be hard to provide your data in the form of a pandas DataFrame.
+    # For this case, we provide the `ParsrConverter` wrapper, which can help you convert, for example, a PDF file into a document that you can index.
+    os.system("docker run -d -p 3001:3001 axarev/parsr")
+    os.system("wget https://www.w3.org/WAI/WCAG21/working-examples/pdf-table/table.pdf")
+
+    converter = ParsrConverter()
+    docs = converter.convert("table.pdf")
+    tables = [doc for doc in docs if doc["content_type"] == "table"]
+
+    print(tables)


if __name__ == "__main__":
    tutorial15_tableqa()