haystack/test/test_table_reader.py
bogdankostic 655d721371
Add Table Reader (#1446)
* first draft / notes on new primitives

* wip label / feedback refactor

* rename doc.text -> doc.content. add doc.content_type

* add datatype for content

* remove faq_question_field from ES and weaviate. rename text_field -> content_field in docstores. update tutorials for content field

* update converters for . Add warning for empty

* Add first draft of TableReader

* rename label.question -> label.query. Allow sorting of Answers.

* Add calculation of answer scores

* WIP primitives

* Adapt input and output to new primitives

* Add doc strings

* Add tests

* update ui/reader for new Answer format

* Improve Label. First refactoring of MultiLabel. Adjust eval code

* fixed workflow conflict with introducing new one (#1472)

* Add latest docstring and tutorial changes

* make add_eval_data() work again

* fix reader formats. WIP fix _extract_docs_and_labels_from_dict

* fix test reader

* Add latest docstring and tutorial changes

* fix another test case for reader

* fix mypy in farm reader.eval()

* fix mypy in farm reader.eval()

* WIP ORM refactor

* Add latest docstring and tutorial changes

* fix mypy weaviate

* make label and multilabel dataclasses

* bump mypy env in CI to python 3.8

* WIP refactor Label ORM

* WIP refactor Label ORM

* simplify tests for individual doc stores

* WIP refactoring markers of tests

* test alternative approach for tests with existing parametrization

* WIP refactor ORMs

* fix skip logic of already parametrized tests

* fix weaviate behaviour in tests - not parametrizing it in our general test cases.

* Add latest docstring and tutorial changes

* fix some tests

* remove sql from document_store_types

* fix markers for generator and pipeline test

* remove inmemory marker

* remove unneeded elasticsearch markers

* add dataclasses-json dependency. adjust ORM to just store JSON repr

* ignore type as dataclasses_json seems to miss functionality here

* update readme and contributing.md

* update contributing

* adjust example

* fix duplicate doc handling for custom index

* Add latest docstring and tutorial changes

* fix some ORM issues. fix get_all_labels_aggregated.

* update drop flags where get_all_labels_aggregated() was used before

* Add latest docstring and tutorial changes

* add to_json(). add + fix tests

* fix no_answer handling in label / multilabel

* fix duplicate docs in memory doc store. change primary key for sql doc table

* fix mypy issues

* fix mypy issues

* haystack/retriever/base.py

* fix test_write_document_meta[elastic]

* fix test_elasticsearch_custom_fields

* fix test_labels[elastic]

* fix crawler

* fix converter

* fix docx converter

* fix preprocessor

* fix test_utils

* fix tfidf retriever. fix selection of docstore in tests with multiple fixtures / parameterizations

* Add latest docstring and tutorial changes

* fix crawler test. fix ocrconverter attribute

* fix test_elasticsearch_custom_query

* fix generator pipeline

* fix ocr converter

* fix ragenerator

* Add latest docstring and tutorial changes

* fix test_load_and_save_yaml for elasticsearch

* fixes for pipeline tests

* fix faq pipeline

* fix pipeline tests

* Add latest docstring and tutorial changes

* fix weaviate

* Add latest docstring and tutorial changes

* trigger CI

* satisfy mypy

* Add latest docstring and tutorial changes

* satisfy mypy

* Add latest docstring and tutorial changes

* trigger CI

* fix question generation test

* fix ray. fix Q-generation

* fix translator test

* satisfy mypy

* wip refactor feedback rest api

* fix rest api feedback endpoint

* fix doc classifier

* remove relation of Labels -> Docs in SQL ORM

* fix faiss/milvus tests

* fix doc classifier test

* fix eval test

* fixing eval issues

* Add latest docstring and tutorial changes

* fix mypy

* WIP replace dataclasses-json with manual serialization

* Add latest docstring and tutorial changes

* revert to dataclasses-json serialization for now. remove debug prints.

* update docstrings

* fix extractor. fix Answer Span init

* fix api test

* Adapt answer format

* Add latest docstring and tutorial changes

* keep meta data of answers in reader.run()

* Fix mypy

* fix meta handling

* address review feedback

* Add latest docstring and tutorial changes

* Allow inference on GPU

* Remove automatic aggregation

* Add automatic aggregation

* Add latest docstring and tutorial changes

* Add torch-scatter dependency

* Add wheel to torch-scatter dependency

* Fix requirements

* Fix requirements

* Fix requirements

* Adapt setup.py to allow for wheels

* Fix requirements

* Fix requirements

* Add type hints and code snippet

* Add latest docstring and tutorial changes

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
Co-authored-by: Markus Paff <markuspaff.mp@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2021-10-15 16:34:48 +02:00


import pandas as pd

from haystack import Document, Pipeline
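
# These tests rely on a `table_reader` pytest fixture defined outside this file
# (typically in the suite's conftest.py). A minimal sketch of what such a
# fixture might look like -- the import path and model name are assumptions
# that vary across Haystack versions, not something taken from this file:
#
#     import pytest
#     from haystack.nodes import TableReader
#
#     @pytest.fixture
#     def table_reader():
#         return TableReader(model_name_or_path="google/tapas-base-finetuned-wtq")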


def test_table_reader(table_reader):
    data = {
        "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
        "age": ["57", "46", "60"],
        "number of movies": ["87", "53", "69"],
        "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
    }
    table = pd.DataFrame(data)
    query = "When was DiCaprio born?"
    prediction = table_reader.predict(query=query, documents=[Document(content=table, content_type="table")])
    assert prediction["answers"][0].answer == "10 june 1996"
    assert prediction["answers"][0].offsets_in_context[0].start == 7
    assert prediction["answers"][0].offsets_in_context[0].end == 8

    # test aggregation
    query = "How old are DiCaprio and Pitt on average?"
    prediction = table_reader.predict(query=query, documents=[Document(content=table, content_type="table")])
    assert prediction["answers"][0].answer == "51.5"
    assert prediction["answers"][0].meta["answer_cells"] == ["57", "46"]
    assert prediction["answers"][0].meta["aggregation_operator"] == "AVERAGE"
    assert prediction["answers"][0].offsets_in_context[0].start == 1
    assert prediction["answers"][0].offsets_in_context[0].end == 2
    assert prediction["answers"][0].offsets_in_context[1].start == 5
    assert prediction["answers"][0].offsets_in_context[1].end == 6
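
    # Added commentary, an assumption inferred from the asserted values above
    # rather than from the reader's docs: offsets_in_context appear to index
    # the table's cells linearized row by row. For this 3x4 table, cell 7 is
    # DiCaprio's "date of birth" and cells 1 and 5 are the two age cells
    # averaged above.
    flat_cells = table.values.flatten().tolist()
    assert flat_cells[7] == "10 june 1996"
    assert flat_cells[1] == "57" and flat_cells[5] == "46"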


def test_table_reader_in_pipeline(table_reader):
    pipeline = Pipeline()
    pipeline.add_node(table_reader, "TableReader", ["Query"])
    data = {
        "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
        "age": ["57", "46", "60"],
        "number of movies": ["87", "53", "69"],
        "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
    }
    table = pd.DataFrame(data)
    query = "Which actors played in more than 60 movies?"
    prediction = pipeline.run(query=query, documents=[Document(content=table, content_type="table")])
    assert prediction["answers"][0].answer == "brad pitt, george clooney"
    assert prediction["answers"][0].meta["aggregation_operator"] == "NONE"
    assert prediction["answers"][0].offsets_in_context[0].start == 0
    assert prediction["answers"][0].offsets_in_context[0].end == 1
    assert prediction["answers"][0].offsets_in_context[1].start == 8
    assert prediction["answers"][0].offsets_in_context[1].end == 9
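
    # Consistent with the row-by-row cell linearization noted above: cells 0
    # and 8 are the "actors" cells "brad pitt" and "george clooney" (rows 0
    # and 2 of the 4-column table).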


def test_table_reader_aggregation(table_reader):
    data = {
        "Mountain": ["Mount Everest", "K2", "Kangchenjunga", "Lhotse", "Makalu"],
        "Height": ["8848m", "8,611 m", "8 586m", "8 516 m", "8,485m"],
    }
    table = pd.DataFrame(data)

    query = "How tall are all mountains on average?"
    prediction = table_reader.predict(query=query, documents=[Document(content=table, content_type="table")])
    assert prediction["answers"][0].answer == "8609.2 m"
    assert prediction["answers"][0].meta["aggregation_operator"] == "AVERAGE"
    assert prediction["answers"][0].meta["answer_cells"] == ["8848m", "8,611 m", "8 586m", "8 516 m", "8,485m"]

    query = "How tall are all mountains together?"
    prediction = table_reader.predict(query=query, documents=[Document(content=table, content_type="table")])
    assert prediction["answers"][0].answer == "43046.0 m"
    assert prediction["answers"][0].meta["aggregation_operator"] == "SUM"
    assert prediction["answers"][0].meta["answer_cells"] == ["8848m", "8,611 m", "8 586m", "8 516 m", "8,485m"]
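
    # The aggregated answers are consistent with the reader parsing the mixed
    # cell formats ("8848m", "8,611 m", "8 586m", ...) to plain numbers first:
    # (8848 + 8611 + 8586 + 8516 + 8485) / 5 == 8609.2 and the sum is 43046.0,
    # while meta["answer_cells"] preserves the cells' original strings.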