Add API endpoint to export accuracy metrics from user feedback + created_at timestamp (#803)

* WIP feedback metrics

* fix filters and zero division

* add created_at and model_name fields to labels

* add created_at value

* remove debug log level

* fix attribute init

* move timestamp creation down to docstore / db level

* fix import
This commit is contained in:
Malte Pietsch 2021-02-15 10:48:59 +01:00 committed by GitHub
parent 03cda26d85
commit 6798192d40
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 81 additions and 8 deletions

View File

@ -188,12 +188,15 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
"answer": {"type": "text"},
"is_correct_answer": {"type": "boolean"},
"is_correct_document": {"type": "boolean"},
"origin": {"type": "keyword"},
"origin": {"type": "keyword"}, # e.g. user-feedback or gold-label
"document_id": {"type": "keyword"},
"offset_start_in_doc": {"type": "long"},
"no_answer": {"type": "boolean"},
"model_id": {"type": "keyword"},
"type": {"type": "keyword"},
"created_at": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"},
"updated_at": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"}
#TODO add pipeline_hash and pipeline_name once we migrated the REST API to pipelines
}
}
}
@ -364,6 +367,12 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
else:
label = l
# create timestamps if not available yet
if not label.created_at:
label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
if not label.updated_at:
label.updated_at = label.created_at
_label = {
"_op_type": "index" if self.update_existing_documents else "create",
"_index": index,

View File

@ -3,6 +3,7 @@ from collections import defaultdict
from copy import deepcopy
from typing import Dict, List, Optional, Union, Generator
from uuid import uuid4
import time
import numpy as np
from scipy.spatial.distance import cosine
@ -87,6 +88,11 @@ class InMemoryDocumentStore(BaseDocumentStore):
for label in label_objects:
label_id = str(uuid4())
# create timestamps if not available yet
if not label.created_at:
label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
if not label.updated_at:
label.updated_at = label.created_at
self.indexes[index][label_id] = label
def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:

View File

@ -22,8 +22,8 @@ class ORMBase(Base):
__abstract__ = True
id = Column(String(100), default=lambda: str(uuid4()), primary_key=True)
created = Column(DateTime, server_default=func.now())
updated = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
created_at = Column(DateTime, server_default=func.now())
updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
class DocumentORM(ORMBase):
@ -424,6 +424,8 @@ class SQLDocumentStore(BaseDocumentStore):
answer=row.answer,
offset_start_in_doc=row.offset_start_in_doc,
model_id=row.model_id,
created_at=row.created_at,
updated_at=row.updated_at
)
return label

View File

@ -1,7 +1,5 @@
from abc import abstractmethod
from typing import Any, Optional, Dict, List
from uuid import uuid4
import numpy as np
@ -78,6 +76,7 @@ class Document:
def __str__(self):
return str(self.to_dict())
class Label:
def __init__(self, question: str,
answer: str,
@ -88,7 +87,9 @@ class Label:
document_id: Optional[str] = None,
offset_start_in_doc: Optional[int] = None,
no_answer: Optional[bool] = None,
model_id: Optional[int] = None):
model_id: Optional[int] = None,
created_at: Optional[str] = None,
updated_at: Optional[str] = None):
"""
Object used to represent label/feedback in a standardized way within Haystack.
This includes labels from dataset like SQuAD, annotations from labeling tools,
@ -106,6 +107,10 @@ class Label:
:param offset_start_in_doc: the answer start offset in the document.
:param no_answer: whether the question is unanswerable.
:param model_id: model_id used for prediction (in-case of user feedback).
:param created_at: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
:param updated_at: Timestamp of update with format yyyy-MM-dd HH:mm:ss.
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
"""
# Create a unique ID (either new one, or one from user input)
@ -114,6 +119,8 @@ class Label:
else:
self.id = str(uuid4())
self.created_at = created_at
self.updated_at = updated_at
self.question = question
self.answer = answer
self.is_correct_answer = is_correct_answer
@ -142,7 +149,9 @@ class Label:
getattr(other, 'document_id', None) == self.document_id and
getattr(other, 'offset_start_in_doc', None) == self.offset_start_in_doc and
getattr(other, 'no_answer', None) == self.no_answer and
getattr(other, 'model_id', None) == self.model_id)
getattr(other, 'model_id', None) == self.model_id and
getattr(other, 'created_at', None) == self.created_at and
getattr(other, 'updated_at', None) == self.updated_at)
def __hash__(self):
return hash(self.question +
@ -153,7 +162,8 @@ class Label:
str(self.document_id) +
str(self.offset_start_in_doc) +
str(self.no_answer) +
str(self.model_id))
str(self.model_id)
)
def __repr__(self):
return str(self.to_dict())

View File

@ -1,7 +1,9 @@
from typing import Optional
import time
from fastapi import APIRouter
from pydantic import BaseModel, Field
from typing import Dict, Union, List
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from rest_api.config import (
@ -65,6 +67,8 @@ class DocQAFeedback(FAQQAFeedback):
..., description="The answer start offset in the original doc. Only required for doc-qa feedback."
)
class FilterRequest(BaseModel):
    # Request body wrapper for optional filters.
    # `filters` maps a string key to a single string, a list of strings, or None;
    # the whole mapping may be omitted, in which case it defaults to None.
    filters: Optional[Dict[str, Union[str, List[str], None]]] = None
@router.post("/doc-qa-feedback")
def doc_qa_feedback(feedback: DocQAFeedback):
@ -77,6 +81,48 @@ def faq_qa_feedback(feedback: FAQQAFeedback):
document_store.write_labels([{"origin": "user-feedback-faq", **feedback_payload}])
@router.post("/eval-doc-qa-feedback")
def eval_doc_qa_feedback(filters: FilterRequest = None):
    """
    Return basic accuracy metrics based on the user feedback.
    Which ratio of answers was correct? Which ratio of documents was correct?
    You can supply filters in the request to only use a certain subset of labels.

    **Example:**

        ```
        | curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' \\
        | --header 'Content-Type: application/json' \\
        | --data-raw '{ "filters": {"document_id": ["XRR3xnEBCYVTkbTystOB"]} }'
        ```

    :param filters: Optional request body. Its `filters` mapping (if any) is combined
                    with a fixed `origin` filter of `["user-feedback"]`, which replaces
                    any `origin` entry supplied by the caller.
    :return: Dict with `answer_accuracy`, `document_accuracy` and `n_feedback`.
             Both accuracies are None when no label matches the filters.
    """
    # `filters` itself may be None (parameter default), and a request model may
    # carry `filters=None` (field default). Treat both as "no extra filters"
    # instead of subscripting None.
    requested = filters.filters if filters is not None else None

    # Work on a copy so the incoming request model is not mutated.
    label_filters = dict(requested or {})
    label_filters["origin"] = ["user-feedback"]

    labels = document_store.get_all_labels(
        index=DB_INDEX_FEEDBACK,
        filters=label_filters
    )

    n_feedback = len(labels)
    if n_feedback == 0:
        # Avoid a division by zero; report "no data" explicitly.
        return {"answer_accuracy": None,
                "document_accuracy": None,
                "n_feedback": 0}

    n_correct_answers = sum(1 for label in labels if label.is_correct_answer)
    n_correct_documents = sum(1 for label in labels if label.is_correct_document)

    return {"answer_accuracy": n_correct_answers / n_feedback,
            "document_accuracy": n_correct_documents / n_feedback,
            "n_feedback": n_feedback}
@router.get("/export-doc-qa-feedback")
def export_doc_qa_feedback(context_size: int = 2_000):
"""