mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 12:37:27 +00:00
Add API endpoint to export accuracy metrics from user feedback + created_at timestamp (#803)
* WIP feedback metrics
* Fix filters and zero division
* Add created_at and model_name fields to labels
* Add created_at value
* Remove debug log level
* Fix attribute init
* Move timestamp creation down to docstore / DB level
* Fix import
This commit is contained in:
parent
03cda26d85
commit
6798192d40
@ -188,12 +188,15 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
"answer": {"type": "text"},
|
||||
"is_correct_answer": {"type": "boolean"},
|
||||
"is_correct_document": {"type": "boolean"},
|
||||
"origin": {"type": "keyword"},
|
||||
"origin": {"type": "keyword"}, # e.g. user-feedback or gold-label
|
||||
"document_id": {"type": "keyword"},
|
||||
"offset_start_in_doc": {"type": "long"},
|
||||
"no_answer": {"type": "boolean"},
|
||||
"model_id": {"type": "keyword"},
|
||||
"type": {"type": "keyword"},
|
||||
"created_at": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"},
|
||||
"updated_at": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"}
|
||||
#TODO add pipeline_hash and pipeline_name once we migrated the REST API to pipelines
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -364,6 +367,12 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
else:
|
||||
label = l
|
||||
|
||||
# create timestamps if not available yet
|
||||
if not label.created_at:
|
||||
label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
if not label.updated_at:
|
||||
label.updated_at = label.created_at
|
||||
|
||||
_label = {
|
||||
"_op_type": "index" if self.update_existing_documents else "create",
|
||||
"_index": index,
|
||||
|
||||
@ -3,6 +3,7 @@ from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from typing import Dict, List, Optional, Union, Generator
|
||||
from uuid import uuid4
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
from scipy.spatial.distance import cosine
|
||||
@ -87,6 +88,11 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
|
||||
for label in label_objects:
|
||||
label_id = str(uuid4())
|
||||
# create timestamps if not available yet
|
||||
if not label.created_at:
|
||||
label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
if not label.updated_at:
|
||||
label.updated_at = label.created_at
|
||||
self.indexes[index][label_id] = label
|
||||
|
||||
def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
|
||||
|
||||
@ -22,8 +22,8 @@ class ORMBase(Base):
|
||||
__abstract__ = True
|
||||
|
||||
id = Column(String(100), default=lambda: str(uuid4()), primary_key=True)
|
||||
created = Column(DateTime, server_default=func.now())
|
||||
updated = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
|
||||
created_at = Column(DateTime, server_default=func.now())
|
||||
updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
|
||||
|
||||
|
||||
class DocumentORM(ORMBase):
|
||||
@ -424,6 +424,8 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
answer=row.answer,
|
||||
offset_start_in_doc=row.offset_start_in_doc,
|
||||
model_id=row.model_id,
|
||||
created_at=row.created_at,
|
||||
updated_at=row.updated_at
|
||||
)
|
||||
return label
|
||||
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Optional, Dict, List
|
||||
from uuid import uuid4
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@ -78,6 +76,7 @@ class Document:
|
||||
def __str__(self):
|
||||
return str(self.to_dict())
|
||||
|
||||
|
||||
class Label:
|
||||
def __init__(self, question: str,
|
||||
answer: str,
|
||||
@ -88,7 +87,9 @@ class Label:
|
||||
document_id: Optional[str] = None,
|
||||
offset_start_in_doc: Optional[int] = None,
|
||||
no_answer: Optional[bool] = None,
|
||||
model_id: Optional[int] = None):
|
||||
model_id: Optional[int] = None,
|
||||
created_at: Optional[str] = None,
|
||||
updated_at: Optional[str] = None):
|
||||
"""
|
||||
Object used to represent label/feedback in a standardized way within Haystack.
|
||||
This includes labels from dataset like SQuAD, annotations from labeling tools,
|
||||
@ -106,6 +107,10 @@ class Label:
|
||||
:param offset_start_in_doc: the answer start offset in the document.
|
||||
:param no_answer: whether the question is unanswerable.
|
||||
:param model_id: model_id used for prediction (in-case of user feedback).
|
||||
:param created_at: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
|
||||
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
|
||||
:param updated_at: Timestamp of update with format yyyy-MM-dd HH:mm:ss.
|
||||
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
"""
|
||||
|
||||
# Create a unique ID (either new one, or one from user input)
|
||||
@ -114,6 +119,8 @@ class Label:
|
||||
else:
|
||||
self.id = str(uuid4())
|
||||
|
||||
self.created_at = created_at
|
||||
self.updated_at = updated_at
|
||||
self.question = question
|
||||
self.answer = answer
|
||||
self.is_correct_answer = is_correct_answer
|
||||
@ -142,7 +149,9 @@ class Label:
|
||||
getattr(other, 'document_id', None) == self.document_id and
|
||||
getattr(other, 'offset_start_in_doc', None) == self.offset_start_in_doc and
|
||||
getattr(other, 'no_answer', None) == self.no_answer and
|
||||
getattr(other, 'model_id', None) == self.model_id)
|
||||
getattr(other, 'model_id', None) == self.model_id and
|
||||
getattr(other, 'created_at', None) == self.created_at and
|
||||
getattr(other, 'updated_at', None) == self.updated_at)
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self.question +
|
||||
@ -153,7 +162,8 @@ class Label:
|
||||
str(self.document_id) +
|
||||
str(self.offset_start_in_doc) +
|
||||
str(self.no_answer) +
|
||||
str(self.model_id))
|
||||
str(self.model_id)
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_dict())
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
from typing import Optional
|
||||
import time
|
||||
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Union, List
|
||||
|
||||
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
|
||||
from rest_api.config import (
|
||||
@ -65,6 +67,8 @@ class DocQAFeedback(FAQQAFeedback):
|
||||
..., description="The answer start offset in the original doc. Only required for doc-qa feedback."
|
||||
)
|
||||
|
||||
class FilterRequest(BaseModel):
|
||||
filters: Optional[Dict[str, Optional[Union[str, List[str]]]]] = None
|
||||
|
||||
@router.post("/doc-qa-feedback")
|
||||
def doc_qa_feedback(feedback: DocQAFeedback):
|
||||
@ -77,6 +81,48 @@ def faq_qa_feedback(feedback: FAQQAFeedback):
|
||||
document_store.write_labels([{"origin": "user-feedback-faq", **feedback_payload}])
|
||||
|
||||
|
||||
@router.post("/eval-doc-qa-feedback")
def eval_doc_qa_feedback(filters: FilterRequest = None):
    r"""
    Return basic accuracy metrics based on the user feedback.
    Which ratio of answers was correct? Which ratio of documents was correct?
    You can supply filters in the request to only use a certain subset of labels.

    **Example:**

        ```
        | curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' \
        | --header 'Content-Type: application/json' \
        | --data-raw '{ "filters": {"document_id": ["XRR3xnEBCYVTkbTystOB"]} }'
        ```

    :param filters: Optional request body whose ``filters`` dict narrows down the
                    labels that are evaluated. The ``origin`` filter is always
                    overwritten with ``["user-feedback"]``.
    :return: Dict with ``answer_accuracy``, ``document_accuracy`` (both ``None``
             when no label matched) and ``n_feedback`` (number of matched labels).
    """
    # A request body without a "filters" key yields a FilterRequest whose
    # `filters` attribute is None, so both levels have to be checked.
    user_filters = filters.filters if filters is not None else None

    # Work on a copy so the incoming request model is never mutated.
    label_filters = dict(user_filters) if user_filters else {}

    # Only labels that originate from user feedback are part of this evaluation.
    label_filters["origin"] = ["user-feedback"]

    labels = document_store.get_all_labels(
        index=DB_INDEX_FEEDBACK,
        filters=label_filters,
    )

    n_feedback = len(labels)
    if n_feedback == 0:
        # No matching feedback: report "unknown" instead of dividing by zero.
        return {
            "answer_accuracy": None,
            "document_accuracy": None,
            "n_feedback": 0,
        }

    n_correct_answers = sum(1 for label in labels if label.is_correct_answer)
    n_correct_documents = sum(1 for label in labels if label.is_correct_document)

    return {
        "answer_accuracy": n_correct_answers / n_feedback,
        "document_accuracy": n_correct_documents / n_feedback,
        "n_feedback": n_feedback,
    }
|
||||
|
||||
@router.get("/export-doc-qa-feedback")
|
||||
def export_doc_qa_feedback(context_size: int = 2_000):
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user