2021-10-04 11:21:00 +02:00
import os
2022-02-14 11:43:26 +01:00
from copy import deepcopy
2021-04-07 17:53:32 +02:00
from pathlib import Path
2020-10-16 13:25:31 +02:00
import pytest
from fastapi . testclient import TestClient
2021-10-13 14:23:23 +02:00
2021-10-04 11:21:00 +02:00
from rest_api . application import app
2020-10-16 13:25:31 +02:00
2022-01-26 18:12:55 +01:00
2022-02-03 13:43:18 +01:00
# Feedback payload shared by the feedback tests in this module: it is posted as
# the JSON body of the feedback endpoint, and its items are compared against
# what that endpoint returns afterwards.
# NOTE(review): the string literals in this file carry padding spaces just
# inside their quotes; this looks like a formatting artifact -- confirm against
# the original sources before relying on the exact keys, values and offsets.
FEEDBACK = {
" id " : " 123 " ,
" query " : " Who made the PDF specification? " ,
" document " : {
" content " : " A sample PDF file \n \n History and standardization \n Format (PDF) Adobe Systems made the PDF specification available free of charge in 1993. In the early years PDF was popular mainly in desktop publishing workflows, and competed with a variety of formats such as DjVu, Envoy, Common Ground Digital Paper, Farallon Replica and even Adobe ' s own PostScript format. PDF was a proprietary format controlled by Adobe until it was released as an open standard on July 1, 2008, and published by the International Organization for Standardization as ISO 32000-1:2008, at which time control of the specification passed to an ISO Committee of volunteer industry experts. In 2008, Adobe published a Public Patent License to ISO 32000-1 granting royalty-free rights for all patents owned by Adobe that are necessary to make, use, sell, and distribute PDF-compliant implementations. PDF 1.7, the sixth edition of the PDF specification that became ISO 32000-1, includes some proprietary technologies defined only by Adobe, such as Adobe XML Forms Architecture (XFA) and JavaScript extension for Acrobat, which are referenced by ISO 32000-1 as normative and indispensable for the full implementation of the ISO 32000-1 specification. These proprietary technologies are not standardized and their specification is published only on Adobes website. Many of them are also not supported by popular third-party implementations of PDF. Column 1 " ,
" content_type " : " text " ,
" score " : None ,
" id " : " fc18c987a8312e72a47fb1524f230bb0 " ,
" meta " : { } ,
" embedding " : None ,
" id_hash_keys " : None ,
} ,
" answer " : {
" answer " : " Adobe Systems " ,
" type " : " extractive " ,
" context " : " A sample PDF file \n \n History and standardization \n Format (PDF) Adobe Systems made the PDF specification available free of charge in 1993. In the early ye " ,
" offsets_in_context " : [ { " start " : 60 , " end " : 73 } ] ,
" offsets_in_document " : [ { " start " : 60 , " end " : 73 } ] ,
" document_id " : " fc18c987a8312e72a47fb1524f230bb0 " ,
" meta " : { } ,
" score " : None ,
} ,
" is_correct_answer " : True ,
" is_correct_document " : True ,
" origin " : " user-feedback " ,
" pipeline_id " : " some-123 " ,
}
2021-10-04 11:21:00 +02:00
2021-11-23 17:13:16 +01:00
def exclude_no_answer(responses):
    """Strip entries without a usable answer from a query response, in place.

    An entry is kept only when its answer value is present and truthy, so
    missing, ``None`` and empty-string answers are all dropped.

    :param responses: mapping holding the list of answer dicts.
    :return: the same ``responses`` object, with its answer list replaced.
    """
    kept = []
    for candidate in responses[" answers "]:
        if candidate.get(" answer "):
            kept.append(candidate)
    responses[" answers "] = kept
    return responses
2022-02-14 11:43:26 +01:00
@pytest.fixture()
def client() -> TestClient:
    """Yield a ``TestClient`` for the REST app with documents and feedback wiped.

    The pipeline YAML location and the indexing pipeline name are published
    through environment variables before the client is built. The same wipe
    (delete all documents, delete feedback) runs once before the test gets the
    client and once more after the test is done with it.
    """
    pipeline_yaml = Path(__file__).parent / " samples " / " pipeline " / " test_pipeline.yaml "
    os.environ[" PIPELINE_YAML_PATH "] = str(pipeline_yaml.absolute())
    os.environ[" INDEXING_PIPELINE_NAME "] = " indexing_text_pipeline "

    test_client = TestClient(app)

    def wipe_state():
        # An empty filter set is sent so that no document is excluded from the delete
        test_client.post(url=" /documents/delete_by_filters ", data=' { " filters " : {} } ')
        test_client.delete(url=" /feedback ")

    wipe_state()
    yield test_client
    wipe_state()
2021-10-12 10:53:54 +02:00
2020-10-16 13:25:31 +02:00
2022-02-14 11:43:26 +01:00
@pytest.fixture()
def populated_client(client: TestClient) -> TestClient:
    """Yield ``client`` after uploading the two sample PDFs.

    Every upload carries a ``meta`` form field with a fixed key/value pair plus
    the position of the file in the upload order as ``meta_index``, and must be
    answered with HTTP 200.
    """
    pdf_dir = Path(__file__).parent / " samples " / " pdf "
    for index, pdf_name in enumerate((" sample_pdf_1.pdf ", " sample_pdf_2.pdf ")):
        # Open each file only for the duration of its own request so the handle
        # is closed even if the upload or the assertion fails. The mode has to
        # be exactly "rb": open() raises ValueError for a padded mode string.
        with (pdf_dir / pdf_name).open("rb") as pdf_file:
            response = client.post(
                url=" /file-upload ",
                files={" files ": pdf_file},
                data={" meta ": f' {{ " meta_key " : " meta_value " , " meta_index " : " { index } " }} '},
            )
        assert 200 == response.status_code

    yield client
2020-11-09 20:41:53 +01:00
2021-10-12 10:53:54 +02:00
2022-02-14 11:43:26 +01:00
def test_get_documents(populated_client: TestClient):
    """Filtering on the shared metadata key must return both uploaded PDFs."""
    filters_json = ' { " filters " : { " meta_key " : [ " meta_value " ]}} '
    response = populated_client.post(url=" /documents/get_by_filters ", data=filters_json)
    assert 200 == response.status_code
    documents = response.json()

    # Both uploaded files have to show up among the returned documents
    names = [doc[" meta "][" name "] for doc in documents]
    for expected_name in (" sample_pdf_1.pdf ", " sample_pdf_2.pdf "):
        assert expected_name in names

    # ...and every returned document must carry the metadata value we filtered on
    meta_keys = [doc[" meta "][" meta_key "] for doc in documents]
    assert all(" meta_value " == meta_key for meta_key in meta_keys)
2021-10-12 10:53:54 +02:00
2022-02-14 11:43:26 +01:00
def test_delete_documents(populated_client: TestClient):
    """Deleting by ``meta_index`` removes exactly the matching documents."""
    by_meta_key = ' { " filters " : { " meta_key " : [ " meta_value " ]}} '
    by_index_0 = ' { " filters " : { " meta_index " : [ " 0 " ]}} '
    by_index_1 = ' { " filters " : { " meta_index " : [ " 1 " ]}} '

    def count_matching(filters_json):
        # Fetch the documents matching the filter and report how many there are
        response = populated_client.post(url=" /documents/get_by_filters ", data=filters_json)
        assert 200 == response.status_code
        return len(response.json())

    # Check how many docs there are, and how many of them we will delete
    initial_docs = count_matching(by_meta_key)
    docs_to_delete = count_matching(by_index_0)

    # Delete the documents of the first upload
    response = populated_client.post(url=" /documents/delete_by_filters ", data=by_index_0)
    assert 200 == response.status_code

    # Now there should be fewer documents
    assert count_matching(by_meta_key) == initial_docs - docs_to_delete

    # Make sure the right docs were deleted and the others are still there
    assert count_matching(by_index_0) == 0
    assert count_matching(by_index_1) >= 1
2022-02-03 13:43:18 +01:00
2021-10-19 15:22:44 +02:00
2021-10-12 10:53:54 +02:00
def test_file_upload(client: TestClient):
    """Uploading a PDF succeeds even when ``meta`` contains an unknown field."""
    pdf_path = Path(__file__).parent / " samples " / " pdf " / " sample_pdf_1.pdf "
    # Keep the file open only while the request runs so the handle is always
    # closed. The mode has to be exactly "rb": open() raises ValueError for a
    # padded mode string.
    with pdf_path.open("rb") as pdf_file:
        response = client.post(
            url=" /file-upload ",
            files={" files ": pdf_file},
            data={" meta ": ' { " meta_key " : " meta_value " , " non-existing-field " : " wrong-value " } '},
        )
    assert 200 == response.status_code
2021-10-13 14:23:23 +02:00
2021-10-04 11:21:00 +02:00
def test_query_with_no_filter(populated_client: TestClient):
    """A query without any filter yields the expected top answer."""
    response = populated_client.post(url=" /query ", json={" query ": " Who made the PDF specification? "})
    assert 200 == response.status_code

    # Entries without an answer are dropped before looking at the top result
    answers = exclude_no_answer(response.json())[" answers "]
    assert answers[0][" answer "] == " Adobe Systems "
2020-11-09 20:41:53 +01:00
2021-10-13 14:23:23 +02:00
2021-10-04 11:21:00 +02:00
def test_query_with_one_filter(populated_client: TestClient):
    """A single-value filter passed under the ``Retriever`` params still finds the answer."""
    retriever_params = {" filters ": {" meta_key ": " meta_value "}}
    payload = {
        " query ": " Who made the PDF specification? ",
        " params ": {" Retriever ": retriever_params},
    }
    response = populated_client.post(url=" /query ", json=payload)
    assert 200 == response.status_code

    # Entries without an answer are dropped before looking at the top result
    answers = exclude_no_answer(response.json())[" answers "]
    assert answers[0][" answer "] == " Adobe Systems "
2020-11-09 20:41:53 +01:00
2021-10-13 14:23:23 +02:00
2021-11-18 18:13:03 +01:00
def test_query_with_one_global_filter(populated_client: TestClient):
    """A filter placed directly under ``params`` (no node name) still finds the answer."""
    payload = {
        " query ": " Who made the PDF specification? ",
        " params ": {" filters ": {" meta_key ": " meta_value "}},
    }
    response = populated_client.post(url=" /query ", json=payload)
    assert 200 == response.status_code

    # Entries without an answer are dropped before looking at the top result
    answers = exclude_no_answer(response.json())[" answers "]
    assert answers[0][" answer "] == " Adobe Systems "
2021-10-04 11:21:00 +02:00
def test_query_with_filter_list(populated_client: TestClient):
    """A filter listing several accepted values still finds the answer."""
    accepted_values = [" meta_value ", " another_value "]
    payload = {
        " query ": " Who made the PDF specification? ",
        " params ": {" Retriever ": {" filters ": {" meta_key ": accepted_values}}},
    }
    response = populated_client.post(url=" /query ", json=payload)
    assert 200 == response.status_code

    # Entries without an answer are dropped before looking at the top result
    answers = exclude_no_answer(response.json())[" answers "]
    assert answers[0][" answer "] == " Adobe Systems "
2020-10-16 13:25:31 +02:00
2021-10-13 14:23:23 +02:00
2021-10-04 11:21:00 +02:00
def test_query_with_invalid_filter(populated_client: TestClient):
    """A filter value that no document carries leaves no real answers."""
    payload = {
        " query ": " Who made the PDF specification? ",
        " params ": {" Retriever ": {" filters ": {" meta_key ": " invalid_value "}}},
    }
    response = populated_client.post(url=" /query ", json=payload)
    assert 200 == response.status_code

    # The request itself succeeds; only entries without an answer may come back
    remaining = exclude_no_answer(response.json())[" answers "]
    assert len(remaining) == 0
2021-10-04 11:21:00 +02:00
2021-10-13 14:23:23 +02:00
def test_write_feedback(populated_client: TestClient):
    """Posting the shared feedback payload is accepted with HTTP 200."""
    post_result = populated_client.post(url=" /feedback ", json=FEEDBACK)
    assert post_result.status_code == 200
2020-10-16 13:25:31 +02:00
2021-10-13 14:23:23 +02:00
2022-02-14 11:43:26 +01:00
def test_write_feedback_without_id(populated_client: TestClient):
    """Feedback is accepted with HTTP 200 even when the payload has no id."""
    # Work on a copy so the shared module-level payload keeps its id
    payload = deepcopy(FEEDBACK)
    payload.pop(" id ")
    post_result = populated_client.post(url=" /feedback ", json=payload)
    assert post_result.status_code == 200
2021-10-13 14:23:23 +02:00
def test_get_feedback(client: TestClient):
    """Feedback that was posted is returned unchanged by a subsequent GET.

    Every item of the posted payload is compared against the first feedback
    entry the endpoint returns.
    """
    response = client.post(url=" /feedback ", json=FEEDBACK)
    assert response.status_code == 200

    response = client.get(url=" /feedback ")
    assert response.status_code == 200
    stored = response.json()[0]

    # Compare field by field so a failure names the field that differs
    for key, expected in FEEDBACK.items():
        assert stored[key] == expected, f"feedback field {key!r} differs from the posted value"
2021-10-13 14:23:23 +02:00
2022-02-14 11:43:26 +01:00
def test_delete_feedback(client: TestClient):
    """Deleting feedback leaves exactly one of two stored entries behind.

    Two entries are stored, differing in id and origin. After the delete call
    only one entry is expected to remain.
    NOTE(review): presumably the delete only removes user-originated feedback
    and keeps the gold-label entry -- confirm against the endpoint.
    """
    response = client.post(url=" /feedback ", json=FEEDBACK)
    assert 200 == response.status_code

    # Second entry: same payload, different id and origin
    feedback = deepcopy(FEEDBACK)
    feedback[" id "] = 456
    feedback[" origin "] = " gold-label "
    response = client.post(url=" /feedback ", json=feedback)
    assert 200 == response.status_code

    response = client.get(url=" /feedback ")
    assert 200 == response.status_code
    assert len(response.json()) == 2

    response = client.delete(url=" /feedback ")
    assert 200 == response.status_code

    response = client.get(url=" /feedback ")
    assert 200 == response.status_code
    assert len(response.json()) == 1
2021-11-29 17:03:54 +01:00
def test_export_feedback(client: TestClient):
    """Exported feedback keeps the answer offset consistent with its context.

    For each export variant (full document context, a small context window and
    a very large one), the exported answer text must be found in the exported
    context exactly at the exported start offset.
    """
    response = client.post(url=" /feedback ", json=FEEDBACK)
    assert 200 == response.status_code

    feedback_urls = [
        " /export-feedback?full_document_context=true ",
        " /export-feedback?full_document_context=false&context_size=50 ",
        " /export-feedback?full_document_context=false&context_size=50000 ",
    ]
    for url in feedback_urls:
        # The export is driven by the query string alone, so no request body is
        # sent with the GET (httpx-based test clients do not accept one).
        response = client.get(url=url)
        assert 200 == response.status_code

        paragraph = response.json()[" data "][0][" paragraphs "][0]
        context = paragraph[" context "]
        exported_answer = paragraph[" qas "][0][" answers "][0]
        answer_start = exported_answer[" answer_start "]
        answer = exported_answer[" text "]
        assert context[answer_start : answer_start + len(answer)] == answer
2021-10-13 14:23:23 +02:00
2021-11-29 17:03:54 +01:00
def test_get_feedback_malformed_query(client: TestClient):
    """Feedback carrying a field the payload does not define is rejected with HTTP 422."""
    # Work on a copy so the shared module-level payload stays untouched
    malformed = deepcopy(FEEDBACK)
    malformed.update({" unexpected_field ": " misplaced-value "})
    post_result = client.post(url=" /feedback ", json=malformed)
    assert 422 == post_result.status_code