mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 09:49:48 +00:00 
			
		
		
		
	 738e008020
			
		
	
	
		738e008020
		
			
		
	
	
	
	
		
			
			* Add run_batch methods for batch querying * Update Documentation & Code Style * Fix mypy * Update Documentation & Code Style * Fix mypy * Fix linter * Fix tests * Update Documentation & Code Style * Fix tests * Update Documentation & Code Style * Fix mypy * Fix rest api test * Update Documentation & Code Style * Add Doc strings * Update Documentation & Code Style * Add batch_size as attribute to nodes supporting batching * Adapt error messages * Adapt type of filters in retrievers * Revert change about truncation_warning in summarizer * Unify multiple_doc_lists tests * Use smaller models in extractor tests * Add return types to JoinAnswers and RouteDocuments * Adapt return statements in reader's run_batch method * Allow list of filters * Adapt error messages * Update Documentation & Code Style * Fix tests * Fix mypy * Adapt print_questions * Remove disabling warning about too many public methods * Add flag for pylint to disable warning about too many public methods in pipelines/base.py and document_stores/base.py * Add type check * Update Documentation & Code Style * Adapt tutorial 11 * Update Documentation & Code Style * Add query_batch method for DCDocStore * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
		
			
				
	
	
		
			161 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			161 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import logging
 | |
| 
 | |
| import pandas as pd
 | |
| import pytest
 | |
| 
 | |
| from haystack.schema import Document, Answer
 | |
| from haystack.pipelines.base import Pipeline
 | |
| 
 | |
| 
 | |
def test_table_reader(table_reader):
    """Ask a lookup question over a small actors table and check the top answer.

    The first returned answer must be Di Caprio's date-of-birth cell and its
    first offset span must be 7..8.
    """
    actors_table = pd.DataFrame(
        {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["58", "47", "60"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
        }
    )
    table_doc = Document(content=actors_table, content_type="table")

    result = table_reader.predict(query="When was Di Caprio born?", documents=[table_doc])

    top_answer = result["answers"][0]
    assert top_answer.answer == "11 november 1974"

    # NOTE(review): 7 looks like a row-major cell index (row 1, column 3 of a
    # 4-column table) rather than a character offset — confirm against the reader.
    first_span = top_answer.offsets_in_context[0]
    assert first_span.start == 7
    assert first_span.end == 8
 | |
| 
 | |
| 
 | |
def test_table_reader_batch_single_query_single_doc_list(table_reader):
    """Run ``predict_batch`` with one query string and one flat list of Documents.

    The flat list holds exactly one table Document, so the test expects
    ``prediction["answers"]`` to be a list with a single entry, itself a
    list of ``Answer`` objects.
    """
    data = {
        "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
        "age": ["58", "47", "60"],
        "number of movies": ["87", "53", "69"],
        "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
    }
    table = pd.DataFrame(data)

    query = "When was Di Caprio born?"
    prediction = table_reader.predict_batch(queries=query, documents=[Document(content=table, content_type="table")])
    # Expected output: List of lists of answers
    assert isinstance(prediction["answers"], list)
    assert isinstance(prediction["answers"][0], list)
    assert isinstance(prediction["answers"][0][0], Answer)
    # Only one Document was supplied, so exactly one list of answers is expected.
    assert len(prediction["answers"]) == 1  # Predictions for 1 doc
 | |
| 
 | |
| 
 | |
def test_table_reader_batch_single_query_multiple_doc_lists(table_reader):
    """Run ``predict_batch`` with one query string and a list of Document lists.

    A single inner list (holding one table Document) is supplied; the test
    expects one list of ``Answer`` objects back.
    """
    actors_table = pd.DataFrame(
        {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["58", "47", "60"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
        }
    )
    doc_lists = [[Document(content=actors_table, content_type="table")]]

    result = table_reader.predict_batch(queries="When was Di Caprio born?", documents=doc_lists)

    # Shape check, outermost first: list -> list -> Answer.
    answers = result["answers"]
    assert isinstance(answers, list)
    assert isinstance(answers[0], list)
    assert isinstance(answers[0][0], Answer)
    # One collection of documents in, one list of answers out.
    assert len(answers) == 1
 | |
| 
 | |
| 
 | |
def test_table_reader_batch_multiple_queries_single_doc_list(table_reader):
    """Run ``predict_batch`` with two queries and one flat list of Documents.

    The test expects a three-level nesting (list of lists of lists of
    ``Answer``) whose outer length equals the number of queries.
    """
    actors_table = pd.DataFrame(
        {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["58", "47", "60"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
        }
    )
    question = "When was Di Caprio born?"
    flat_docs = [Document(content=actors_table, content_type="table")]

    result = table_reader.predict_batch(queries=[question, question], documents=flat_docs)

    # Shape check, outermost first: list -> list -> list -> Answer.
    answers = result["answers"]
    assert isinstance(answers, list)
    assert isinstance(answers[0], list)
    assert isinstance(answers[0][0], list)
    assert isinstance(answers[0][0][0], Answer)
    # One outer entry per query.
    assert len(answers) == 2
 | |
| 
 | |
| 
 | |
def test_table_reader_batch_multiple_queries_multiple_doc_lists(table_reader):
    """Run ``predict_batch`` with two queries and two Document lists.

    The test expects a list of lists of ``Answer`` with one outer entry per
    supplied collection of documents.
    """
    actors_table = pd.DataFrame(
        {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["58", "47", "60"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
        }
    )
    question = "When was Di Caprio born?"
    # Two separate single-document collections, each wrapping the same table.
    doc_lists = [[Document(content=actors_table, content_type="table")] for _ in range(2)]

    result = table_reader.predict_batch(queries=[question, question], documents=doc_lists)

    # Shape check, outermost first: list -> list -> Answer.
    answers = result["answers"]
    assert isinstance(answers, list)
    assert isinstance(answers[0], list)
    assert isinstance(answers[0][0], Answer)
    # One outer entry per collection of documents.
    assert len(answers) == 2
 | |
| 
 | |
| 
 | |
def test_table_reader_in_pipeline(table_reader):
    """Wire the table reader into a ``Pipeline`` as its only node and query it.

    The pipeline run must produce the same top answer and offset span (7..8)
    that the direct ``predict`` test checks for.
    """
    qa_pipeline = Pipeline()
    qa_pipeline.add_node(table_reader, "TableReader", ["Query"])

    actors_table = pd.DataFrame(
        {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["58", "47", "60"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
        }
    )
    table_doc = Document(content=actors_table, content_type="table")

    result = qa_pipeline.run(query="When was Di Caprio born?", documents=[table_doc])

    top_answer = result["answers"][0]
    assert top_answer.answer == "11 november 1974"

    first_span = top_answer.offsets_in_context[0]
    assert first_span.start == 7
    assert first_span.end == 8
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize("table_reader", ["tapas"], indirect=True)
def test_table_reader_aggregation(table_reader):
    """Check AVERAGE and SUM aggregation answers over a mountain-height table.

    The heights are deliberately written in inconsistent formats. For each
    question the top answer must carry the expected aggregated string, the
    expected ``aggregation_operator`` and all five height cells as
    ``answer_cells``.
    """
    heights = ["8848m", "8,611 m", "8 586m", "8 516 m", "8,485m"]
    mountains_table = pd.DataFrame(
        {
            "Mountain": ["Mount Everest", "K2", "Kangchenjunga", "Lhotse", "Makalu"],
            "Height": heights,
        }
    )

    # (question, expected answer text, expected aggregation operator)
    cases = (
        ("How tall are all mountains on average?", "8609.2 m", "AVERAGE"),
        ("How tall are all mountains together?", "43046.0 m", "SUM"),
    )
    for question, expected_text, expected_operator in cases:
        # A fresh Document is built for every call, as each query is independent.
        table_doc = Document(content=mountains_table, content_type="table")
        result = table_reader.predict(query=question, documents=[table_doc])

        top_answer = result["answers"][0]
        assert top_answer.answer == expected_text
        assert top_answer.meta["aggregation_operator"] == expected_operator
        assert top_answer.meta["answer_cells"] == ["8848m", "8,611 m", "8 586m", "8 516 m", "8,485m"]
 | |
| 
 | |
| 
 | |
def test_table_without_rows(caplog, table_reader):
    """Feed the reader a table Document backed by an empty DataFrame.

    While capturing logs at WARNING level, the test expects a message naming
    the skipped document id and an empty ``answers`` list.
    """
    empty_doc = Document(content=pd.DataFrame(), content_type="table", id="no_rows")

    with caplog.at_level(logging.WARNING):
        result = table_reader.predict(query="test", documents=[empty_doc])
        assert "Skipping document with id 'no_rows'" in caplog.text
        assert len(result["answers"]) == 0
 | |
| 
 | |
| 
 | |
def test_text_document(caplog, table_reader):
    """Feed the reader a Document whose content is a plain string, not a table.

    While capturing logs at WARNING level, the test expects a message naming
    the skipped document id and an empty ``answers`` list.
    """
    text_doc = Document(content="text", id="text_doc")

    with caplog.at_level(logging.WARNING):
        result = table_reader.predict(query="test", documents=[text_doc])
        assert "Skipping document with id 'text_doc'" in caplog.text
        assert len(result["answers"]) == 0
 |