From c5542bd3fbdcd6e211e1686aa289939ed29427f4 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Tue, 1 Mar 2022 17:42:11 +0100 Subject: [PATCH] Add `RouteDocuments` and `JoinAnswers` nodes (#2256) * Add SplitDocumentList and JoinAnswer nodes * Update Documentation & Code Style * Add tests + adapt tutorial * Update Documentation & Code Style * Remove branch from installation path in Tutorial * Update Documentation & Code Style * Fix typing * Update Documentation & Code Style * Change name of SplitDocumentList to RouteDocuments * Update Documentation & Code Style * Adapt tutorials to new name * Add test for JoinAnswers * Update Documentation & Code Style * Adapt name of test for JoinAnswers node Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/tutorials/tutorials/15.md | 102 ++- haystack/__init__.py | 2 +- haystack/nodes/__init__.py | 2 +- haystack/nodes/other/__init__.py | 2 + haystack/nodes/other/join_answers.py | 64 ++ haystack/nodes/other/route_documents.py | 72 ++ haystack/pipelines/base.py | 52 +- .../haystack-pipeline-1.2.0.schema.json | 92 +++ .../haystack-pipeline-1.2.0rc0.schema.json | 92 +++ test/test_pipeline.py | 51 +- tutorials/Tutorial15_TableQA.ipynb | 654 +++++++++++++++++- tutorials/Tutorial15_TableQA.py | 33 +- 12 files changed, 1160 insertions(+), 58 deletions(-) create mode 100644 haystack/nodes/other/join_answers.py create mode 100644 haystack/nodes/other/route_documents.py diff --git a/docs/_src/tutorials/tutorials/15.md b/docs/_src/tutorials/tutorials/15.md index c6bdb6325..b2c606c5f 100644 --- a/docs/_src/tutorials/tutorials/15.md +++ b/docs/_src/tutorials/tutorials/15.md @@ -38,8 +38,9 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial # The TaPAs-based TableReader requires the torch-scatter library !pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html -# If you run this notebook on Google Colab, you might need to -# restart the 
runtime after installing haystack. +# Install pygraphviz for visualization of Pipelines +!apt install libgraphviz-dev +!pip install pygraphviz ``` ### Start an Elasticsearch server @@ -94,7 +95,7 @@ Just as text passages, tables are represented as `Document` objects in Haystack. from haystack.utils import fetch_archive_from_http doc_dir = "data" -s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_tables_sample.json.zip" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip" fetch_archive_from_http(url=s3_url, output_dir=doc_dir) ``` @@ -246,6 +247,101 @@ prediction = table_qa_pipeline.run("How many twin buildings are under constructi print_answers(prediction, details="minimum") ``` +# Open-Domain QA on Text and Tables +With haystack, you not only have the possibility to do QA on texts or tables, solely, but you can also use both texts and tables as your source of information. + +To demonstrate this, we add 1,000 sample text passages from the OTT-QA dataset. + + +```python +# Add 1,000 text passages from OTT-QA to our document store. + + +def read_ottqa_texts(filename): + processed_passages = [] + with open(filename) as passages: + passages = json.load(passages) + for title, content in passages.items(): + title = title[6:] + title = title.replace("_", " ") + document = Document(content=content, content_type="text", meta={"title": title}) + processed_passages.append(document) + + return processed_passages + + +passages = read_ottqa_texts(f"{doc_dir}/ottqa_texts_sample.json") +document_store.write_documents(passages, index=document_index) +``` + + +```python +document_store.update_embeddings(retriever=retriever, update_existing_embeddings=False) +``` + +## Pipeline for QA on Combination of Text and Tables +We are using one node for retrieving both texts and tables, the `TableTextRetriever`. 
In order to do question-answering on the Documents coming from the `TableTextRetriever`, we need to route Documents of type `"text"` to a `FARMReader` (or alternatively `TransformersReader`) and Documents of type `"table"` to a `TableReader`. + +To achieve this, we make use of two additional nodes: +- `RouteDocuments`: Splits the List of Documents retrieved by the `TableTextRetriever` into two lists containing only Documents of type `"text"` or `"table"`, respectively. +- `JoinAnswers`: Takes Answers coming from two different Readers (in this case `FARMReader` and `TableReader`) and joins them to a single list of Answers. + + +```python +from haystack.nodes import FARMReader, RouteDocuments, JoinAnswers + +text_reader = FARMReader("deepset/roberta-base-squad2") +# In order to get meaningful scores from the TableReader, use "deepset/tapas-large-nq-hn-reader" or +# "deepset/tapas-large-nq-reader" as TableReader models. The disadvantage of these models is, however, +# that they are not capable of doing aggregations over multiple table cells. +table_reader = TableReader("deepset/tapas-large-nq-hn-reader") +route_documents = RouteDocuments() +join_answers = JoinAnswers() +``` + + +```python +text_table_qa_pipeline = Pipeline() +text_table_qa_pipeline.add_node(component=retriever, name="TableTextRetriever", inputs=["Query"]) +text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["TableTextRetriever"]) +text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"]) +text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"]) +text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"]) +``` + + +```python +# Let's have a look at the structure of the combined Table and Text QA pipeline. 
+from IPython import display + +text_table_qa_pipeline.draw() +display.Image("pipeline.png") +``` + + +```python +# Example query whose answer resides in a text passage +predictions = text_table_qa_pipeline.run(query="Who is Aleksandar Trifunovic?") +``` + + +```python +# We can see both text passages and tables as contexts of the predicted answers. +print_answers(predictions, details="minimum") +``` + + +```python +# Example query whose answer resides in a table +predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?") +``` + + +```python +# We can see both text passages and tables as contexts of the predicted answers. +print_answers(predictions, details="minimum") +``` + ## About us This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany diff --git a/haystack/__init__.py b/haystack/__init__.py index ae67727cf..f023dc7ef 100644 --- a/haystack/__init__.py +++ b/haystack/__init__.py @@ -102,7 +102,7 @@ except ImportError: from haystack.modeling.evaluation import eval from haystack.modeling.logger import MLFlowLogger, StdoutLogger, TensorBoardLogger -from haystack.nodes.other import JoinDocuments, Docs2Answers +from haystack.nodes.other import JoinDocuments, Docs2Answers, JoinAnswers, RouteDocuments from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier from haystack.nodes.file_classifier import FileTypeClassifier from haystack.utils import preprocessing diff --git a/haystack/nodes/__init__.py b/haystack/nodes/__init__.py index 52f2ae002..bc3d86239 100644 --- a/haystack/nodes/__init__.py +++ b/haystack/nodes/__init__.py @@ -21,7 +21,7 @@ from haystack.nodes.file_converter import ( AzureConverter, ParsrConverter, ) -from haystack.nodes.other import Docs2Answers, JoinDocuments +from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers from haystack.nodes.preprocessor import BasePreProcessor, 
PreProcessor from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier from haystack.nodes.question_generator import QuestionGenerator diff --git a/haystack/nodes/other/__init__.py b/haystack/nodes/other/__init__.py index 8341a135b..4cfb55cea 100644 --- a/haystack/nodes/other/__init__.py +++ b/haystack/nodes/other/__init__.py @@ -1,2 +1,4 @@ from haystack.nodes.other.docs2answers import Docs2Answers from haystack.nodes.other.join_docs import JoinDocuments +from haystack.nodes.other.route_documents import RouteDocuments +from haystack.nodes.other.join_answers import JoinAnswers diff --git a/haystack/nodes/other/join_answers.py b/haystack/nodes/other/join_answers.py new file mode 100644 index 000000000..e96652dfc --- /dev/null +++ b/haystack/nodes/other/join_answers.py @@ -0,0 +1,64 @@ +from typing import Optional, List, Dict, Tuple + +from haystack.schema import Answer +from haystack.nodes import BaseComponent + + +class JoinAnswers(BaseComponent): + """ + A node to join `Answer`s produced by multiple `Reader` nodes. + """ + + def __init__( + self, join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None + ): + """ + :param join_mode: `"concatenate"` to combine `Answer`s from multiple `Reader`s. `"merge"` to aggregate scores + of individual `Answer`s. + :param weights: A node-wise list (length of list must be equal to the number of input nodes) of weights for + adjusting `Answer` scores when using the `"merge"` join_mode. By default, equal weight is assigned to each + `Reader` score. This parameter is not compatible with the `"concatenate"` join_mode. + :param top_k_join: Limit `Answer`s to top_k based on the resulting scores of the join. + """ + + assert join_mode in ["concatenate", "merge"], f"JoinAnswers node does not support '{join_mode}' join_mode." 
+ assert not ( + weights is not None and join_mode == "concatenate" + ), "Weights are not compatible with 'concatenate' join_mode" + + # Save init parameters to enable export of component config as YAML + self.set_config(join_mode=join_mode, weights=weights, top_k_join=top_k_join) + + self.join_mode = join_mode + self.weights = [float(i) / sum(weights) for i in weights] if weights else None + self.top_k_join = top_k_join + + def run(self, inputs: List[Dict], top_k_join: Optional[int] = None) -> Tuple[Dict, str]: # type: ignore + reader_results = [inp["answers"] for inp in inputs] + + if not top_k_join: + top_k_join = self.top_k_join + + if self.join_mode == "concatenate": + concatenated_answers = [answer for cur_reader_result in reader_results for answer in cur_reader_result] + concatenated_answers = sorted(concatenated_answers, reverse=True)[:top_k_join] + return {"answers": concatenated_answers, "labels": inputs[0].get("labels", None)}, "output_1" + + elif self.join_mode == "merge": + merged_answers = self._merge_answers(reader_results) + + merged_answers = merged_answers[:top_k_join] + return {"answers": merged_answers, "labels": inputs[0].get("labels", None)}, "output_1" + + else: + raise ValueError(f"Invalid join_mode: {self.join_mode}") + + def _merge_answers(self, reader_results: List[List[Answer]]) -> List[Answer]: + weights = self.weights if self.weights else [1 / len(reader_results)] * len(reader_results) + + for result, weight in zip(reader_results, weights): + for answer in result: + if isinstance(answer.score, float): + answer.score *= weight + + return sorted([answer for cur_reader_result in reader_results for answer in cur_reader_result], reverse=True) diff --git a/haystack/nodes/other/route_documents.py b/haystack/nodes/other/route_documents.py new file mode 100644 index 000000000..f9ba7e3ed --- /dev/null +++ b/haystack/nodes/other/route_documents.py @@ -0,0 +1,72 @@ +from typing import List, Tuple, Dict, Optional + +from haystack.nodes.base import 
BaseComponent +from haystack.schema import Document + + +class RouteDocuments(BaseComponent): + """ + A node to split a list of `Document`s by `content_type` or by the values of a metadata field and route them to + different nodes. + """ + + # By default (split_by == "content_type"), the node has two outgoing edges. + outgoing_edges = 2 + + def __init__(self, split_by: str = "content_type", metadata_values: Optional[List[str]] = None): + """ + :param split_by: Field to split the documents by, either `"content_type"` or a metadata field name. + If this parameter is set to `"content_type"`, the list of `Document`s will be split into a list containing + only `Document`s of type `"text"` (will be routed to `"output_1"`) and a list containing only `Document`s of + type `"table"` (will be routed to `"output_2"`). + If this parameter is set to a metadata field name, you need to specify the parameter `metadata_values` as + well. + :param metadata_values: If the parameter `split_by` is set to a metadata field name, you need to provide a list + of values to group the `Document`s to. `Document`s whose metadata field is equal to the first value of the + provided list will be routed to `"output_1"`, `Document`s whose metadata field is equal to the second + value of the provided list will be routed to `"output_2"`, etc. + """ + + assert split_by == "content_type" or metadata_values is not None, ( + "If split_by is set to the name of a metadata field, you must provide metadata_values " + "to group the documents to." 
+ ) + + # Save init parameters to enable export of component config as YAML + self.set_config(split_by=split_by, metadata_values=metadata_values) + + self.split_by = split_by + self.metadata_values = metadata_values + + # If we split list of Documents by a metadata field, number of outgoing edges might change + if split_by != "content_type" and metadata_values is not None: + self.outgoing_edges = len(metadata_values) + + def run(self, documents: List[Document]) -> Tuple[Dict, str]: # type: ignore + if self.split_by == "content_type": + split_documents: Dict[str, List[Document]] = {"output_1": [], "output_2": []} + + for doc in documents: + if doc.content_type == "text": + split_documents["output_1"].append(doc) + elif doc.content_type == "table": + split_documents["output_2"].append(doc) + + else: + assert isinstance(self.metadata_values, list), ( + "You need to provide metadata_values if you want to split" " a list of Documents by a metadata field." + ) + split_documents = {f"output_{i+1}": [] for i in range(len(self.metadata_values))} + for doc in documents: + current_metadata_value = doc.meta.get(self.split_by, None) + # Disregard current document if it does not contain the provided metadata field + if current_metadata_value is not None: + try: + index = self.metadata_values.index(current_metadata_value) + except ValueError: + # Disregard current document if current_metadata_value is not in the provided metadata_values + continue + + split_documents[f"output_{index+1}"].append(doc) + + return split_documents, "split_documents" diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index c9321d354..1358f7c88 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -645,28 +645,38 @@ class Pipeline(BasePipeline): f"Exception while running node `{node_id}` with input `{node_input}`: {e}, full stack trace: {tb}" ) queue.pop(node_id) - next_nodes = self.get_next_nodes(node_id, stream_id) - for n in next_nodes: # add successor nodes 
with corresponding inputs to the queue - if queue.get(n): # concatenate inputs if it's a join node - existing_input = queue[n] - if "inputs" not in existing_input.keys(): - updated_input: dict = {"inputs": [existing_input, node_output], "params": params} - if query: - updated_input["query"] = query - if file_paths: - updated_input["file_paths"] = file_paths - if labels: - updated_input["labels"] = labels - if documents: - updated_input["documents"] = documents - if meta: - updated_input["meta"] = meta + # + if stream_id == "split_documents": + for stream_id in [key for key in node_output.keys() if key.startswith("output_")]: + current_node_output = {k: v for k, v in node_output.items() if not k.startswith("output_")} + current_docs = node_output.pop(stream_id) + current_node_output["documents"] = current_docs + next_nodes = self.get_next_nodes(node_id, stream_id) + for n in next_nodes: + queue[n] = current_node_output + else: + next_nodes = self.get_next_nodes(node_id, stream_id) + for n in next_nodes: # add successor nodes with corresponding inputs to the queue + if queue.get(n): # concatenate inputs if it's a join node + existing_input = queue[n] + if "inputs" not in existing_input.keys(): + updated_input: dict = {"inputs": [existing_input, node_output], "params": params} + if query: + updated_input["query"] = query + if file_paths: + updated_input["file_paths"] = file_paths + if labels: + updated_input["labels"] = labels + if documents: + updated_input["documents"] = documents + if meta: + updated_input["meta"] = meta + else: + existing_input["inputs"].append(node_output) + updated_input = existing_input + queue[n] = updated_input else: - existing_input["inputs"].append(node_output) - updated_input = existing_input - queue[n] = updated_input - else: - queue[n] = node_output + queue[n] = node_output i = 0 else: i += 1 # attempt executing next node in the queue as current `node_id` has unprocessed predecessors diff --git 
a/json-schemas/haystack-pipeline-1.2.0.schema.json b/json-schemas/haystack-pipeline-1.2.0.schema.json index d425ee098..ac86f1de8 100644 --- a/json-schemas/haystack-pipeline-1.2.0.schema.json +++ b/json-schemas/haystack-pipeline-1.2.0.schema.json @@ -59,6 +59,9 @@ { "$ref": "#/definitions/ImageToTextConverterComponent" }, + { + "$ref": "#/definitions/JoinAnswersComponent" + }, { "$ref": "#/definitions/JoinDocumentsComponent" }, @@ -86,6 +89,9 @@ { "$ref": "#/definitions/RCIReaderComponent" }, + { + "$ref": "#/definitions/RouteDocumentsComponent" + }, { "$ref": "#/definitions/SentenceTransformersRankerComponent" }, @@ -1093,6 +1099,51 @@ ], "additionalProperties": false }, + "JoinAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "JoinDocumentsComponent": { "type": "object", "properties": { @@ -1646,6 +1697,47 @@ ], "additionalProperties": false }, + "RouteDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RouteDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "split_by": { + "title": "Split By", + "default": "content_type", + "type": "string" + }, + "metadata_values": { + "title": "Metadata Values", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "SentenceTransformersRankerComponent": { "type": "object", "properties": { diff --git a/json-schemas/haystack-pipeline-1.2.0rc0.schema.json b/json-schemas/haystack-pipeline-1.2.0rc0.schema.json index 520d8ee0c..4387b996b 100644 --- a/json-schemas/haystack-pipeline-1.2.0rc0.schema.json +++ b/json-schemas/haystack-pipeline-1.2.0rc0.schema.json @@ -59,6 +59,9 @@ { "$ref": "#/definitions/ImageToTextConverterComponent" }, + { + "$ref": "#/definitions/JoinAnswersComponent" + }, { "$ref": "#/definitions/JoinDocumentsComponent" }, @@ -95,6 +98,9 @@ { "$ref": "#/definitions/SklearnQueryClassifierComponent" }, + { + "$ref": "#/definitions/SplitDocumentListComponent" + }, { "$ref": "#/definitions/TableReaderComponent" }, @@ -1093,6 +1099,51 @@ ], "additionalProperties": false }, + "JoinAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "JoinDocumentsComponent": { "type": "object", "properties": { @@ -1836,6 +1887,47 @@ ], "additionalProperties": false }, + "SplitDocumentListComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SplitDocumentList" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "split_by": { + "title": "Split By", + "default": "content_type", + "type": "string" + }, + "metadata_values": { + "title": "Metadata Values", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "TableReaderComponent": { "type": "object", "properties": { diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 4299076c4..1b216e8f2 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -3,10 +3,12 @@ from pathlib import Path import os import json from unittest.mock import Mock + +import pandas as pd import pytest import responses -from haystack import __version__ +from haystack import __version__, Document, Answer, JoinAnswers from haystack.document_stores.base import BaseDocumentStore from haystack.document_stores.deepsetcloud import DeepsetCloudDocumentStore from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore @@ -17,7 +19,7 @@ from haystack.nodes.retriever.base import BaseRetriever from haystack.nodes.retriever.sparse import ElasticsearchRetriever from haystack.pipelines import Pipeline, DocumentSearchPipeline, RootNode, ExtractiveQAPipeline from haystack.pipelines.base import _PipelineCodeGen -from haystack.nodes import DensePassageRetriever, EmbeddingRetriever +from haystack.nodes import DensePassageRetriever, EmbeddingRetriever, RouteDocuments from conftest import MOCK_DC, DC_API_ENDPOINT, DC_API_KEY, DC_TEST_INDEX, SAMPLES_PATH, deepset_cloud_fixture @@ -1041,6 +1043,51 @@ def test_documentsearch_document_store_authentication(retriever_with_docs, docum assert kwargs["headers"] == auth_headers +def test_route_documents_by_content_type(): + # Test routing by content_type + docs = [ + Document(content="text document", content_type="text"), + Document( + content=pd.DataFrame(columns=["col 1", "col 2"], data=[["row 1", "row 1"], ["row 2", "row 2"]]), + content_type="table", + ), + ] + + route_documents = RouteDocuments() + result, _ = route_documents.run(documents=docs) + assert len(result["output_1"]) == 1 + assert len(result["output_2"]) == 1 + assert result["output_1"][0].content_type == "text" + assert 
result["output_2"][0].content_type == "table" + + +def test_route_documents_by_metafield(test_docs_xs): + # Test routing by metadata field + docs = [Document.from_dict(doc) if isinstance(doc, dict) else doc for doc in test_docs_xs] + route_documents = RouteDocuments(split_by="meta_field", metadata_values=["test1", "test3", "test5"]) + result, _ = route_documents.run(docs) + assert len(result["output_1"]) == 1 + assert len(result["output_2"]) == 1 + assert len(result["output_3"]) == 1 + assert result["output_1"][0].meta["meta_field"] == "test1" + assert result["output_2"][0].meta["meta_field"] == "test3" + assert result["output_3"][0].meta["meta_field"] == "test5" + + +@pytest.mark.parametrize("join_mode", ["concatenate", "merge"]) +def test_join_answers(join_mode): + inputs = [{"answers": [Answer(answer="answer 1", score=0.7)]}, {"answers": [Answer(answer="answer 2", score=0.8)]}] + + join_answers = JoinAnswers(join_mode=join_mode) + result, _ = join_answers.run(inputs) + assert len(result["answers"]) == 2 + assert result["answers"] == sorted(result["answers"], reverse=True) + + result, _ = join_answers.run(inputs, top_k_join=1) + assert len(result["answers"]) == 1 + assert result["answers"][0].answer == "answer 2" + + def clean_faiss_document_store(): if Path("existing_faiss_document_store").exists(): os.remove("existing_faiss_document_store") diff --git a/tutorials/Tutorial15_TableQA.ipynb b/tutorials/Tutorial15_TableQA.ipynb index b29637ea6..d448118ce 100644 --- a/tutorials/Tutorial15_TableQA.ipynb +++ b/tutorials/Tutorial15_TableQA.ipynb @@ -57,8 +57,9 @@ "# The TaPAs-based TableReader requires the torch-scatter library\n", "!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html\n", "\n", - "# If you run this notebook on Google Colab, you might need to\n", - "# restart the runtime after installing haystack." 
+ "# Install pygraphviz for visualization of Pipelines\n", + "!apt install libgraphviz-dev\n", + "!pip install pygraphviz" ] }, { @@ -87,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": { "id": "S4PGj1A6wKWu" }, @@ -151,24 +152,24 @@ "from haystack.utils import fetch_archive_from_http\n", "\n", "doc_dir = \"data\"\n", - "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_tables_sample.json.zip\"\n", + "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip\"\n", "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SKjw2LuXxlGh", - "outputId": "c24f8ca0-1a58-44ea-f01d-414db4c8f1f4" + "outputId": "92c67d24-d6fb-413e-8dd7-53075141d508" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ " Result ... Score\n", "0 Winner ... 6-1 , 6-1\n", @@ -276,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "XM-ijy6Zz11L" }, @@ -289,18 +290,18 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YHfQWxVI0N2e", - "outputId": "05976ac9-bee3-4eb8-b36d-01f1db5250db" + "outputId": "1d8dc4d2-a184-489e-defa-d445d76c458f" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ " Name ... Status\n", "0 Twin Towers II ... 
Never built\n", @@ -364,18 +365,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ILuAXkyN4F7x", - "outputId": "7bdb7190-fcf8-4296-c237-cffc78dac4aa" + "outputId": "4bd19dcb-df8e-4a4d-b9d2-d34650e9e5c2" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ " Name ... Status\n", "0 Twin Towers II ... Never built\n", @@ -412,20 +413,23 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ilbsecgA4vfN", - "outputId": "5f4e8f0b-bc9e-485b-c933-546fcad2b411" + "outputId": "f845f43e-43e8-48fe-d0ef-91b17a5eff0e" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "{ 'answers': [ Answer(answer='12', type='extractive', score=1.0, context= Name ... Status\n", + "\n", + "Query: How many twin buildings are under construction?\n", + "Answers:\n", + "[ ]\n" ] } ], @@ -472,18 +475,18 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "It8XYT2ZTVJs", - "outputId": "5bd712a0-9f22-4fc0-a4f1-b01b15cb9916" + "outputId": "7d31af60-e04a-485d-f0ee-f29592b03928" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Predicted answer: 12\n", "Meta field: {'aggregation_operator': 'COUNT', 'answer_cells': ['Three Sixty West', 'Gateway Towers', 'Rustomjee Crown', 'Lokhandwala Minerva', 'Lamar Towers', 'Indonesia One Towers', 'India Bulls Sky Forest Tower', 'Capital Towers', 'One Avighna Park', 'The Destiny ( Tower )', 'Oberoi Esquire Towers', 'Bhoomi Celestia']}\n" @@ -509,7 +512,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": { "id": "G-aZZvyv4-Mf" }, @@ -525,19 +528,22 @@ }, { "cell_type": "code", - "execution_count": 18, + 
"execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "m8evexnW6dev", - "outputId": "290168b1-294e-42ed-c970-e5ddfefb3396" + "outputId": "40514084-f516-4f13-fb48-6a55cb578366" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ + "\n", + "Query: How many twin buildings are under construction?\n", + "Answers:\n", "[ { 'answer': '12',\n", " 'context': Name ... Status\n", "0 Twin Towers II ... Never built\n", @@ -717,6 +723,596 @@ "print_answers(prediction, details=\"minimum\")" ] }, + { + "cell_type": "markdown", + "source": [ + "# Open-Domain QA on Text and Tables\n", + "With haystack, you not only have the possibility to do QA on texts or tables, solely, but you can also use both texts and tables as your source of information.\n", + "\n", + "To demonstrate this, we add 1,000 sample text passages from the OTT-QA dataset." + ], + "metadata": { + "id": "8uMzl9Ml_D1B" + } + }, + { + "cell_type": "code", + "source": [ + "# Add 1,000 text passages from OTT-QA to our document store.\n", + "\n", + "\n", + "def read_ottqa_texts(filename):\n", + " processed_passages = []\n", + " with open(filename) as passages:\n", + " passages = json.load(passages)\n", + " for title, content in passages.items():\n", + " title = title[6:]\n", + " title = title.replace(\"_\", \" \")\n", + " document = Document(content=content, content_type=\"text\", meta={\"title\": title})\n", + " processed_passages.append(document)\n", + "\n", + " return processed_passages\n", + "\n", + "\n", + "passages = read_ottqa_texts(f\"{doc_dir}/ottqa_texts_sample.json\")\n", + "document_store.write_documents(passages, index=document_index)" + ], + "metadata": { + "id": "4CBcIjIq_uFx" + }, + "execution_count": 18, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "document_store.update_embeddings(retriever=retriever, update_existing_embeddings=False)" + ], + "metadata": { + "id": "j1TaNF7SiKgH" + }, + "execution_count": 
null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Pipeline for QA on Combination of Text and Tables\n", + "We are using one node for retrieving both texts and tables, the `TableTextRetriever`. In order to do question-answering on the Documents coming from the `TableTextRetriever`, we need to route Documents of type `\"text\"` to a `FARMReader` (or alternatively `TransformersReader`) and Documents of type `\"table\"` to a `TableReader`.\n", + "\n", + "To achieve this, we make use of two additional nodes:\n", + "- `RouteDocuments`: Splits the List of Documents retrieved by the `TableTextRetriever` into two lists containing only Documents of type `\"text\"` or `\"table\"`, respectively.\n", + "- `JoinAnswers`: Takes Answers coming from two different Readers (in this case `FARMReader` and `TableReader`) and joins them to a single list of Answers." + ], + "metadata": { + "id": "c2sk_uNHj0DY" + } + }, + { + "cell_type": "code", + "source": [ + "from haystack.nodes import FARMReader, RouteDocuments, JoinAnswers\n", + "\n", + "text_reader = FARMReader(\"deepset/roberta-base-squad2\")\n", + "# In order to get meaningful scores from the TableReader, use \"deepset/tapas-large-nq-hn-reader\" or\n", + "# \"deepset/tapas-large-nq-reader\" as TableReader models. 
The disadvantage of these models is, however,\n", + "# that they are not capable of doing aggregations over multiple table cells.\n", + "table_reader = TableReader(\"deepset/tapas-large-nq-hn-reader\")\n", + "route_documents = RouteDocuments()\n", + "join_answers = JoinAnswers()" + ], + "metadata": { + "id": "Ej_j8Q3wlxXE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "text_table_qa_pipeline = Pipeline()\n", + "text_table_qa_pipeline.add_node(component=retriever, name=\"TableTextRetriever\", inputs=[\"Query\"])\n", + "text_table_qa_pipeline.add_node(component=route_documents, name=\"RouteDocuments\", inputs=[\"TableTextRetriever\"])\n", + "text_table_qa_pipeline.add_node(component=text_reader, name=\"TextReader\", inputs=[\"RouteDocuments.output_1\"])\n", + "text_table_qa_pipeline.add_node(component=table_reader, name=\"TableReader\", inputs=[\"RouteDocuments.output_2\"])\n", + "text_table_qa_pipeline.add_node(component=join_answers, name=\"JoinAnswers\", inputs=[\"TextReader\", \"TableReader\"])" + ], + "metadata": { + "id": "Zdq6JnF5m3aP" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Let's have a look at the structure of the combined Table and Text QA pipeline.\n", + "from IPython import display\n", + "\n", + "text_table_qa_pipeline.draw()\n", + "display.Image(\"pipeline.png\")" + ], + "metadata": { + "id": "K4vH1ZEnniut", + "outputId": "85aa17a8-227d-40e4-c8c0-5d0532faa47a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 540 + } + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Example query whose answer resides in a text passage\n", + "predictions = text_table_qa_pipeline.run(query=\"Who is Aleksandar Trifunovic?\")" + ], + "metadata": { + "id": 
"strPNduPoBLe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# We can see both text passages and tables as contexts of the predicted answers.\n", + "print_answers(predictions, details=\"minimum\")" + ], + "metadata": { + "id": "9YiK75tSoOGA", + "outputId": "bd52f841-3846-441f-dd6f-53b02111691e", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Query: Who is Aleksandar Trifunovic?\n", + "Answers:\n", + "[ { 'answer': 'a Serbian professional basketball coach and former player',\n", + " 'context': 'Aleksandar Trifunović ( ; born 30 May 1967 ) is a Serbian '\n", + " 'professional basketball coach and former player .'},\n", + " { 'answer': 'Johnny Höglin',\n", + " 'context': Rank Athlete Country Time\n", + "0 1 Kees Verkerk Netherlands 2:03.4\n", + "1 2 Ivar Eriksen Norway 2:05.0\n", + "2 3 Ard Schenk Netherlands 2:05.0\n", + "3 4 Magne Thomassen Norway 2:05.1\n", + "4 5 Johnny Höglin Sweden 2:05.2\n", + "5 5 Bjørn Tveter Norway 2:05.2\n", + "6 7 Svein-Erik Stiansen Norway 2:05.5\n", + "7 8 Eduard Matusevich Soviet Union 2:06.1\n", + "8 9 Peter Nottet Netherlands 2:06.3\n", + "9 10 Örjan Sandler Sweden 2:07.0\n", + "10 11 Aleksandr Kerchenko Soviet Union 2:07.1\n", + "11 12 Ants Antson Soviet Union 2:07.2\n", + "12 12 Valery Kaplan Soviet Union 2:07.2\n", + "13 14 Jouko Launonen Finland 2:07.5\n", + "14 15 Günter Traub West Germany 2:07.7\n", + "15 16 Jan Bols Netherlands 2:07.8\n", + "16 16 Manne Lavås Sweden 2:07.8\n", + "17 18 Kimmo Koskinen Finland 2:07.9\n", + "18 19 Richard Wurster United States 2:08.4\n", + "19 20 Göran Claeson Sweden 2:08.6},\n", + " { 'answer': 'Ivar Eriksen',\n", + " 'context': Rank Athlete Country Time\n", + "0 1 Kees Verkerk Netherlands 2:03.4\n", + "1 2 Ivar Eriksen Norway 2:05.0\n", + "2 3 Ard Schenk Netherlands 2:05.0\n", + "3 4 Magne Thomassen Norway 2:05.1\n", + 
"4 5 Johnny Höglin Sweden 2:05.2\n", + "5 5 Bjørn Tveter Norway 2:05.2\n", + "6 7 Svein-Erik Stiansen Norway 2:05.5\n", + "7 8 Eduard Matusevich Soviet Union 2:06.1\n", + "8 9 Peter Nottet Netherlands 2:06.3\n", + "9 10 Örjan Sandler Sweden 2:07.0\n", + "10 11 Aleksandr Kerchenko Soviet Union 2:07.1\n", + "11 12 Ants Antson Soviet Union 2:07.2\n", + "12 12 Valery Kaplan Soviet Union 2:07.2\n", + "13 14 Jouko Launonen Finland 2:07.5\n", + "14 15 Günter Traub West Germany 2:07.7\n", + "15 16 Jan Bols Netherlands 2:07.8\n", + "16 16 Manne Lavås Sweden 2:07.8\n", + "17 18 Kimmo Koskinen Finland 2:07.9\n", + "18 19 Richard Wurster United States 2:08.4\n", + "19 20 Göran Claeson Sweden 2:08.6},\n", + " { 'answer': 'Magne Thomassen',\n", + " 'context': Rank Athlete Country Time\n", + "0 1 Kees Verkerk Netherlands 2:03.4\n", + "1 2 Ivar Eriksen Norway 2:05.0\n", + "2 3 Ard Schenk Netherlands 2:05.0\n", + "3 4 Magne Thomassen Norway 2:05.1\n", + "4 5 Johnny Höglin Sweden 2:05.2\n", + "5 5 Bjørn Tveter Norway 2:05.2\n", + "6 7 Svein-Erik Stiansen Norway 2:05.5\n", + "7 8 Eduard Matusevich Soviet Union 2:06.1\n", + "8 9 Peter Nottet Netherlands 2:06.3\n", + "9 10 Örjan Sandler Sweden 2:07.0\n", + "10 11 Aleksandr Kerchenko Soviet Union 2:07.1\n", + "11 12 Ants Antson Soviet Union 2:07.2\n", + "12 12 Valery Kaplan Soviet Union 2:07.2\n", + "13 14 Jouko Launonen Finland 2:07.5\n", + "14 15 Günter Traub West Germany 2:07.7\n", + "15 16 Jan Bols Netherlands 2:07.8\n", + "16 16 Manne Lavås Sweden 2:07.8\n", + "17 18 Kimmo Koskinen Finland 2:07.9\n", + "18 19 Richard Wurster United States 2:08.4\n", + "19 20 Göran Claeson Sweden 2:08.6},\n", + " { 'answer': '5',\n", + " 'context': Position # Player Moving from\n", + "0 F 12 Nikola Kalinić Radnički Kragujevac\n", + "1 SF 6 Nemanja Dangubić Mega Vizura\n", + "2 C 33 Maik Zirbes Brose Baskets\n", + "3 PG 3 Marcus Williams Lokomotiv Kuban\n", + "4 PG 24 Stefan Jović Radnički Kragujevac\n", + "5 C 14 Đorđe Kaplanović FMP\n", + "6 SF 5 
Nikola Čvorović FMP\n", + "7 SG 7 Aleksandar Aranitović Crvena zvezda U18\n", + "8 SG 20 Aleksa Radanov Crvena zvezda U18},\n", + " { 'answer': 'Vasile Sărucan',\n", + " 'context': Rank Name Nationality Result\n", + "0 1 Hans Baumgartner West Germany 8.12\n", + "1 2 Igor Ter-Ovanesyan Soviet Union 7.91\n", + "2 3 Vasile Sărucan Romania 7.88\n", + "3 4 Valeriu Jurcă Romania 7.72\n", + "4 5 Philippe Housiaux Belgium 7.70\n", + "5 6 Andreas Gloerfeld West Germany 7.70\n", + "6 7 Jan Kobuszewski Poland 7.66\n", + "7 8 Jaroslav Brož Czechoslovakia 7.66\n", + "8 9 Alan Lerwill Great Britain 7.61\n", + "9 10 Mikhail Bariban Soviet Union 7.58\n", + "10 11 Valeriy Podluzhniy Soviet Union 7.54\n", + "11 12 Kari Palmen Finland 7.51\n", + "12 13 Georgi Marin Bulgaria 7.51\n", + "13 14 Jesper Tørring Denmark 7.46\n", + "14 15 Milan Spasojević Yugoslavia 7.23\n", + "15 16 Salih Mercan Turkey 6.98\n", + "16 17 Henrik Kalocsai Hungary 5.67},\n", + " { 'answer': 'Belgium',\n", + " 'context': Rank Name Nationality Result\n", + "0 1 Hans Baumgartner West Germany 8.12\n", + "1 2 Igor Ter-Ovanesyan Soviet Union 7.91\n", + "2 3 Vasile Sărucan Romania 7.88\n", + "3 4 Valeriu Jurcă Romania 7.72\n", + "4 5 Philippe Housiaux Belgium 7.70\n", + "5 6 Andreas Gloerfeld West Germany 7.70\n", + "6 7 Jan Kobuszewski Poland 7.66\n", + "7 8 Jaroslav Brož Czechoslovakia 7.66\n", + "8 9 Alan Lerwill Great Britain 7.61\n", + "9 10 Mikhail Bariban Soviet Union 7.58\n", + "10 11 Valeriy Podluzhniy Soviet Union 7.54\n", + "11 12 Kari Palmen Finland 7.51\n", + "12 13 Georgi Marin Bulgaria 7.51\n", + "13 14 Jesper Tørring Denmark 7.46\n", + "14 15 Milan Spasojević Yugoslavia 7.23\n", + "15 16 Salih Mercan Turkey 6.98\n", + "16 17 Henrik Kalocsai Hungary 5.67},\n", + " { 'answer': 'Poland',\n", + " 'context': Rank Name Nationality Result\n", + "0 1 Hans Baumgartner West Germany 8.12\n", + "1 2 Igor Ter-Ovanesyan Soviet Union 7.91\n", + "2 3 Vasile Sărucan Romania 7.88\n", + "3 4 Valeriu Jurcă Romania 
7.72\n", + "4 5 Philippe Housiaux Belgium 7.70\n", + "5 6 Andreas Gloerfeld West Germany 7.70\n", + "6 7 Jan Kobuszewski Poland 7.66\n", + "7 8 Jaroslav Brož Czechoslovakia 7.66\n", + "8 9 Alan Lerwill Great Britain 7.61\n", + "9 10 Mikhail Bariban Soviet Union 7.58\n", + "10 11 Valeriy Podluzhniy Soviet Union 7.54\n", + "11 12 Kari Palmen Finland 7.51\n", + "12 13 Georgi Marin Bulgaria 7.51\n", + "13 14 Jesper Tørring Denmark 7.46\n", + "14 15 Milan Spasojević Yugoslavia 7.23\n", + "15 16 Salih Mercan Turkey 6.98\n", + "16 17 Henrik Kalocsai Hungary 5.67},\n", + " { 'answer': 'Hafþór Júlíus Björnsson',\n", + " 'context': # Name Nationality Pts\n", + "0 1 Hafþór Júlíus Björnsson Iceland 31.5\n", + "1 2 Robert Oberst United States 29\n", + "2 3 Lauri Nami Estonia 24\n", + "3 4 Nick Best United States 14.5\n", + "4 5 Laurence Shahlaei UK 12\n", + "5 6 Wu Long China 6},\n", + " { 'answer': 'Estonia',\n", + " 'context': # Name Nationality Pts\n", + "0 1 Hafþór Júlíus Björnsson Iceland 31.5\n", + "1 2 Robert Oberst United States 29\n", + "2 3 Lauri Nami Estonia 24\n", + "3 4 Nick Best United States 14.5\n", + "4 5 Laurence Shahlaei UK 12\n", + "5 6 Wu Long China 6},\n", + " { 'answer': 'Iceland',\n", + " 'context': # Name Nationality Pts\n", + "0 1 Hafþór Júlíus Björnsson Iceland 31.5\n", + "1 2 Robert Oberst United States 29\n", + "2 3 Lauri Nami Estonia 24\n", + "3 4 Nick Best United States 14.5\n", + "4 5 Laurence Shahlaei UK 12\n", + "5 6 Wu Long China 6},\n", + " { 'answer': 'Egor Antropov ( born May 8 , 1992 ) is a Russian '\n", + " 'professional ice hockey defenceman',\n", + " 'context': 'Egor Antropov ( born May 8 , 1992 ) is a Russian '\n", + " 'professional ice hockey defenceman . 
He is currently '\n", + " 'playing with Piráti Chomutov of the Czech Extral'},\n", + " { 'answer': 'Zurab Magomedovich Yevloyev ( ; born February 20 , 1980 ) '\n", + " 'is a Russian professional football player',\n", + " 'context': 'Zurab Magomedovich Yevloyev ( ; born February 20 , 1980 ) '\n", + " 'is a Russian professional football player . In 2010 , he '\n", + " 'played for FC Angusht Nazran in the'}]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Example query whose answer resides in a table\n", + "predictions = text_table_qa_pipeline.run(query=\"What is Cuba's national tree?\")" + ], + "metadata": { + "id": "QYOHDSmLpzEg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# We can see both text passages and tables as contexts of the predicted answers.\n", + "print_answers(predictions, details=\"minimum\")" + ], + "metadata": { + "id": "4kw53uWep3zj", + "outputId": "b332cc17-3cb8-4e20-d79d-bb4cf656f277", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Query: What is Cuba's national tree?\n", + "Answers:\n", + "[ { 'answer': 'Cuban royal palm',\n", + " 'context': Country ... Scientific name\n", + "0 Afghanistan ... \n", + "1 Albania ... Olea europaea\n", + "2 Antigua and Barbuda ... Bucida buceras\n", + "3 Argentina ... Erythrina crista-galli , Schinopsis balansae\n", + "4 Australia ... Acacia pycnantha\n", + "5 Bahamas ... Guaiacum sanctum\n", + "6 Bangladesh ... Mangifera indica\n", + "7 Belize ... Swietenia macrophylla\n", + "8 Bhutan ... Cupressus cashmeriana\n", + "9 Brazil ... Caesalpinia echinata\n", + "10 Cambodia ... Borassus flabellifer\n", + "11 Canada ... Acer\n", + "12 Chile ... Araucaria araucana\n", + "13 Colombia ... Ceroxylon quindiuense\n", + "14 Costa Rica ... Enterolobium cyclocarpum\n", + "15 Croatia ... Quercus robur\n", + "16 Cuba ... 
Roystonea regia\n", + "17 Cyprus ... Quercus alnifolia\n", + "18 Czech Republic ... Tilia cordata\n", + "19 Denmark ... Fagus sylvatica\n", + "\n", + "[20 rows x 3 columns]},\n", + " { 'answer': 'Quercus sagraeana , the Cuban oak',\n", + " 'context': 'Quercus sagraeana , the Cuban oak , is a medium-sized '\n", + " 'evergreen tree native to western Cuba in the Cuban pine '\n", + " 'forests ecoregion .'},\n", + " { 'answer': \"Glenn O'Brien\",\n", + " 'context': Book title ... Notes\n", + "0 Sex ... The book contains erotica influenced photographs taken by Steven Meisel and ...\n", + "1 Madonna : The Girlie Show ... The photographs in the book showcased behind-the-scenes of the 1993 Girlie S...\n", + "2 The Making of Evita ... Featuring an introduction by Madonna , The Making of Evita chronicles the cr...\n", + "3 The Emperor 's New Clothes : An All-Star Retelling of the Classic Fairy Tale ... This fully illustrated retelling of the classic fairy tale by Hans Christian...\n", + "4 X-Static Process ... In 2002 , Madonna had collaborated with photographer Steven Klein for an art...\n", + "5 Nobody Knows Me ... Available for one month only via Madonna 's official website . Contained 52 ...\n", + "6 Madonna Confessions ... Behind-the-scenes and on-stage pictures from Madonna 's 2006 Confessions Tou...\n", + "7 I Am Because We Are ... The book contains excerpts from interviews with Malawian children , their bi...\n", + "8 Madonna : Sticky & Sweet ... Behind-the-scenes and on-stage photography from Madonna 's Sticky & Sweet To...\n", + "9 Tom Munro ... Munro 's self-titled first monograph book consists of photographs taken by h...\n", + "10 Mayumi 's Kitchen : Macrobiotic Cooking for Body and Soul ... Mayumi Nishimura worked as Madonna 's private chef for seven years , and she...\n", + "\n", + "[11 rows x 6 columns]},\n", + " { 'answer': 'Guy Oseary',\n", + " 'context': Book title ... Notes\n", + "0 Sex ... 
The book contains erotica influenced photographs taken by Steven Meisel and ...\n", + "1 Madonna : The Girlie Show ... The photographs in the book showcased behind-the-scenes of the 1993 Girlie S...\n", + "2 The Making of Evita ... Featuring an introduction by Madonna , The Making of Evita chronicles the cr...\n", + "3 The Emperor 's New Clothes : An All-Star Retelling of the Classic Fairy Tale ... This fully illustrated retelling of the classic fairy tale by Hans Christian...\n", + "4 X-Static Process ... In 2002 , Madonna had collaborated with photographer Steven Klein for an art...\n", + "5 Nobody Knows Me ... Available for one month only via Madonna 's official website . Contained 52 ...\n", + "6 Madonna Confessions ... Behind-the-scenes and on-stage pictures from Madonna 's 2006 Confessions Tou...\n", + "7 I Am Because We Are ... The book contains excerpts from interviews with Malawian children , their bi...\n", + "8 Madonna : Sticky & Sweet ... Behind-the-scenes and on-stage photography from Madonna 's Sticky & Sweet To...\n", + "9 Tom Munro ... Munro 's self-titled first monograph book consists of photographs taken by h...\n", + "10 Mayumi 's Kitchen : Macrobiotic Cooking for Body and Soul ... Mayumi Nishimura worked as Madonna 's private chef for seven years , and she...\n", + "\n", + "[11 rows x 6 columns]},\n", + " { 'answer': 'Guy Oseary',\n", + " 'context': Book title ... Notes\n", + "0 Sex ... The book contains erotica influenced photographs taken by Steven Meisel and ...\n", + "1 Madonna : The Girlie Show ... The photographs in the book showcased behind-the-scenes of the 1993 Girlie S...\n", + "2 The Making of Evita ... Featuring an introduction by Madonna , The Making of Evita chronicles the cr...\n", + "3 The Emperor 's New Clothes : An All-Star Retelling of the Classic Fairy Tale ... This fully illustrated retelling of the classic fairy tale by Hans Christian...\n", + "4 X-Static Process ... 
In 2002 , Madonna had collaborated with photographer Steven Klein for an art...\n", + "5 Nobody Knows Me ... Available for one month only via Madonna 's official website . Contained 52 ...\n", + "6 Madonna Confessions ... Behind-the-scenes and on-stage pictures from Madonna 's 2006 Confessions Tou...\n", + "7 I Am Because We Are ... The book contains excerpts from interviews with Malawian children , their bi...\n", + "8 Madonna : Sticky & Sweet ... Behind-the-scenes and on-stage photography from Madonna 's Sticky & Sweet To...\n", + "9 Tom Munro ... Munro 's self-titled first monograph book consists of photographs taken by h...\n", + "10 Mayumi 's Kitchen : Macrobiotic Cooking for Body and Soul ... Mayumi Nishimura worked as Madonna 's private chef for seven years , and she...\n", + "\n", + "[11 rows x 6 columns]},\n", + " { 'answer': 'Belize',\n", + " 'context': Country ... Scientific name\n", + "0 Afghanistan ... \n", + "1 Albania ... Olea europaea\n", + "2 Antigua and Barbuda ... Bucida buceras\n", + "3 Argentina ... Erythrina crista-galli , Schinopsis balansae\n", + "4 Australia ... Acacia pycnantha\n", + "5 Bahamas ... Guaiacum sanctum\n", + "6 Bangladesh ... Mangifera indica\n", + "7 Belize ... Swietenia macrophylla\n", + "8 Bhutan ... Cupressus cashmeriana\n", + "9 Brazil ... Caesalpinia echinata\n", + "10 Cambodia ... Borassus flabellifer\n", + "11 Canada ... Acer\n", + "12 Chile ... Araucaria araucana\n", + "13 Colombia ... Ceroxylon quindiuense\n", + "14 Costa Rica ... Enterolobium cyclocarpum\n", + "15 Croatia ... Quercus robur\n", + "16 Cuba ... Roystonea regia\n", + "17 Cyprus ... Quercus alnifolia\n", + "18 Czech Republic ... Tilia cordata\n", + "19 Denmark ... Fagus sylvatica\n", + "\n", + "[20 rows x 3 columns]},\n", + " { 'answer': 'Palmyra palm',\n", + " 'context': Country ... Scientific name\n", + "0 Afghanistan ... \n", + "1 Albania ... Olea europaea\n", + "2 Antigua and Barbuda ... Bucida buceras\n", + "3 Argentina ... 
Erythrina crista-galli , Schinopsis balansae\n", + "4 Australia ... Acacia pycnantha\n", + "5 Bahamas ... Guaiacum sanctum\n", + "6 Bangladesh ... Mangifera indica\n", + "7 Belize ... Swietenia macrophylla\n", + "8 Bhutan ... Cupressus cashmeriana\n", + "9 Brazil ... Caesalpinia echinata\n", + "10 Cambodia ... Borassus flabellifer\n", + "11 Canada ... Acer\n", + "12 Chile ... Araucaria araucana\n", + "13 Colombia ... Ceroxylon quindiuense\n", + "14 Costa Rica ... Enterolobium cyclocarpum\n", + "15 Croatia ... Quercus robur\n", + "16 Cuba ... Roystonea regia\n", + "17 Cyprus ... Quercus alnifolia\n", + "18 Czech Republic ... Tilia cordata\n", + "19 Denmark ... Fagus sylvatica\n", + "\n", + "[20 rows x 3 columns]},\n", + " { 'answer': 'Guadeloupe',\n", + " 'context': State ... Official Language ( s )\n", + "0 Antigua and Barbuda ... English\n", + "1 Dominica ... English\n", + "2 Grenada ... English\n", + "3 Montserrat ... English\n", + "4 Saint Kitts and Nevis ... English\n", + "5 Saint Lucia ... English\n", + "6 Saint Vincent and the Grenadines ... English\n", + "7 Anguilla ... English\n", + "8 British Virgin Islands ... English\n", + "9 Guadeloupe ... French\n", + "10 Martinique ... French\n", + "\n", + "[11 rows x 10 columns]},\n", + " { 'answer': 'Basse-Terre',\n", + " 'context': State ... Official Language ( s )\n", + "0 Antigua and Barbuda ... English\n", + "1 Dominica ... English\n", + "2 Grenada ... English\n", + "3 Montserrat ... English\n", + "4 Saint Kitts and Nevis ... English\n", + "5 Saint Lucia ... English\n", + "6 Saint Vincent and the Grenadines ... English\n", + "7 Anguilla ... English\n", + "8 British Virgin Islands ... English\n", + "9 Guadeloupe ... French\n", + "10 Martinique ... French\n", + "\n", + "[11 rows x 10 columns]},\n", + " { 'answer': 'East Caribbean dollar',\n", + " 'context': State ... Official Language ( s )\n", + "0 Antigua and Barbuda ... English\n", + "1 Dominica ... English\n", + "2 Grenada ... English\n", + "3 Montserrat ... 
English\n", + "4 Saint Kitts and Nevis ... English\n", + "5 Saint Lucia ... English\n", + "6 Saint Vincent and the Grenadines ... English\n", + "7 Anguilla ... English\n", + "8 British Virgin Islands ... English\n", + "9 Guadeloupe ... French\n", + "10 Martinique ... French\n", + "\n", + "[11 rows x 10 columns]},\n", + " { 'answer': 'Jenkins',\n", + " 'context': NRHP reference number ... County\n", + "0 72000402 ... Wilkes\n", + "1 ... Meriwether\n", + "2 ... Bartow\n", + "3 71000280 ... Jenkins\n", + "4 ... Chatham\n", + "5 89002015 ... Thomas\n", + "6 ... Glynn\n", + "7 75000615 ... Walton\n", + "8 84001156 ... Sumter\n", + "9 79000713 ... Cobb\n", + "10 82002491 ... Twiggs\n", + "11 74000703 ... Taliaferro\n", + "12 80001039 ... Floyd\n", + "13 90000805 ... Gwinnett\n", + "14 73000620 ... Decatur\n", + "15 79000731 ... Houston\n", + "16 95000741 ... Grady\n", + "17 97000559 ... Greene\n", + "18 74000662 ... Brooks\n", + "19 75000616 ... Washington\n", + "\n", + "[20 rows x 4 columns]},\n", + " { 'answer': \"Primula farinosa , the bird's-eye primrose\",\n", + " 'context': \"Primula farinosa , the bird's-eye primrose , is a small \"\n", + " 'perennial plant in the family Primulaceae , native to '\n", + " 'Northern Europe and northern Asia , and '},\n", + " { 'answer': 'Poospiza',\n", + " 'context': 'Poospiza is a genus of finch-like tanagers found in both '\n", + " 'the South American lowlands and the Andes mountains . '\n", + " 'Generally they are arboreal feeders in '},\n", + " { 'answer': 'golden-crowned sparrow',\n", + " 'context': 'The golden-crowned sparrow ( Zonotrichia atricapilla ) is '\n", + " 'a large American sparrow found in the western part of '\n", + " 'North America .'},\n", + " { 'answer': 'Banksia sessilis var . cordata is a variety of Banksia '\n", + " 'sessilis ( Parrot Bush',\n", + " 'context': 'Banksia sessilis var . cordata is a variety of Banksia '\n", + " 'sessilis ( Parrot Bush ) , with unusually large leaves and '\n", + " 'flower heads . 
It is a rare variety '},\n", + " { 'answer': 'rain',\n", + " 'context': 's and operates hotels at Machu Picchu Natural Reserve , '\n", + " 'the southeastern rain forest of the Amazon in Puerto '\n", + " 'Maldonado , Tambopata , the Sacred Valley'}]\n" + ] + } + ] + }, { "cell_type": "markdown", "metadata": { @@ -757,5 +1353,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/tutorials/Tutorial15_TableQA.py b/tutorials/Tutorial15_TableQA.py index 5c282e21c..b77e719de 100644 --- a/tutorials/Tutorial15_TableQA.py +++ b/tutorials/Tutorial15_TableQA.py @@ -6,7 +6,7 @@ from haystack.utils import launch_es, fetch_archive_from_http, print_answers from haystack.document_stores import ElasticsearchDocumentStore from haystack import Document, Pipeline from haystack.nodes.retriever import TableTextRetriever -from haystack.nodes import TableReader +from haystack.nodes import TableReader, FARMReader, RouteDocuments, JoinAnswers def tutorial15_tableqa(): @@ -115,6 +115,37 @@ def tutorial15_tableqa(): prediction = table_qa_pipeline.run("How many twin buildings are under construction?") print_answers(prediction, details="minimum") + ### Pipeline for QA on Combination of Text and Tables + # We are using one node for retrieving both texts and tables, the TableTextRetriever. + # In order to do question-answering on the Documents coming from the TableTextRetriever, we need to route + # Documents of type "text" to a FARMReader ( or alternatively TransformersReader) and Documents of type + # "table" to a TableReader. + + text_reader = FARMReader("deepset/roberta-base-squad2") + # In order to get meaningful scores from the TableReader, use "deepset/tapas-large-nq-hn-reader" or + # "deepset/tapas-large-nq-reader" as TableReader models. The disadvantage of these models is, however, + # that they are not capable of doing aggregations over multiple table cells. 
+ table_reader = TableReader("deepset/tapas-large-nq-hn-reader") + route_documents = RouteDocuments() + join_answers = JoinAnswers() + + text_table_qa_pipeline = Pipeline() + text_table_qa_pipeline.add_node(component=retriever, name="TableTextRetriever", inputs=["Query"]) + text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["TableTextRetriever"]) + text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"]) + text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"]) + text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"]) + + # Example query whose answer resides in a text passage + predictions = text_table_qa_pipeline.run(query="Who is Aleksandar Trifunovic?") + # We can see both text passages and tables as contexts of the predicted answers. + print_answers(predictions, details="minimum") + + # Example query whose answer resides in a table + predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?") + # We can see both text passages and tables as contexts of the predicted answers. + print_answers(predictions, details="minimum") + if __name__ == "__main__": tutorial15_tableqa()