Mirror of https://github.com/deepset-ai/haystack.git (synced 2026-01-08 13:06:29 +00:00)

* fix pip backtracking issue
* restrict azure-core version
* Remove the trailing comma
* Add skip_magic_trailing_comma in pyproject.toml for pydoc compatibility
* Pin pydoc-markdown _again_

Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
295 lines
14 KiB
Python
from typing import Dict, List, Optional

import logging
import pandas as pd
from collections import OrderedDict, namedtuple
from sklearn.feature_extraction.text import TfidfVectorizer

from haystack.schema import Document
from haystack.document_stores import BaseDocumentStore, KeywordDocumentStore
from haystack.nodes.retriever import BaseRetriever


logger = logging.getLogger(__name__)


class ElasticsearchRetriever(BaseRetriever):
    def __init__(self, document_store: KeywordDocumentStore, top_k: int = 10, custom_query: str = None):
        """
        :param document_store: an instance of an ElasticsearchDocumentStore to retrieve documents from.
        :param custom_query: query string as per Elasticsearch DSL with a mandatory ``${query}`` placeholder.

            Optionally, an ES `filter` clause can be added whose `terms` values are placeholders
            that get substituted at runtime. The placeholder names (``${filter_name_1}``, ``${filter_name_2}``, ...)
            must match the keys of the filters dict supplied to self.retrieve().
            ::
                **An example custom_query:**
                ```python
                |    {
                |        "size": 10,
                |        "query": {
                |            "bool": {
                |                "should": [{"multi_match": {
                |                    "query": ${query},                 // mandatory query placeholder
                |                    "type": "most_fields",
                |                    "fields": ["content", "title"]}}],
                |                "filter": [                            // optional custom filters
                |                    {"terms": {"year": ${years}}},
                |                    {"terms": {"quarter": ${quarters}}},
                |                    {"range": {"date": {"gte": ${date}}}}
                |                ],
                |            }
                |        },
                |    }
                ```

            **For this custom_query, a sample retrieve() could be:**
            ```python
            |    self.retrieve(query="Why did the revenue increase?",
            |                  filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
            ```

            Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings.
            See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html.
            You will find the highlighted output in the returned Document's meta field by key "highlighted".
            ::

                **Example custom_query with highlighting:**
                ```python
                |    {
                |        "size": 10,
                |        "query": {
                |            "bool": {
                |                "should": [{"multi_match": {
                |                    "query": ${query},                 // mandatory query placeholder
                |                    "type": "most_fields",
                |                    "fields": ["content", "title"]}}],
                |            }
                |        },
                |        "highlight": {             // enable highlighting
                |            "fields": {            // for fields content and title
                |                "content": {},
                |                "title": {}
                |            }
                |        },
                |    }
                ```

                **For this custom_query, highlighting info can be accessed by:**
                ```python
                |    docs = self.retrieve(query="Why did the revenue increase?")
                |    highlighted_content = docs[0].meta["highlighted"]["content"]
                |    highlighted_title = docs[0].meta["highlighted"]["title"]
                ```

        :param top_k: How many documents to return per query.
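
        **A minimal construction sketch (assuming `document_store` is an already-initialized, populated
        ElasticsearchDocumentStore; passing `custom_query` is optional):**
        ```python
        |    retriever = ElasticsearchRetriever(
        |        document_store=document_store,  # assumed to exist and contain documents
        |        top_k=10,
        |        custom_query=custom_query,      # a DSL template string like the example above
        |    )
        ```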
        """
        # save init parameters to enable export of component config as YAML
        self.set_config(document_store=document_store, top_k=top_k, custom_query=custom_query)
        self.document_store: KeywordDocumentStore = document_store
        self.top_k = top_k
        self.custom_query = custom_query

    def retrieve(
        self,
        query: str,
        filters: dict = None,
        top_k: Optional[int] = None,
        index: str = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> List[Document]:
        """
        Scan through documents in the DocumentStore and return a small number of documents
        that are most relevant to the query.

        :param query: The query
        :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
        :param top_k: How many documents to return per query.
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        :param headers: Custom HTTP headers to pass to the Elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
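
        **A minimal usage sketch (assuming `retriever` wraps a populated ElasticsearchDocumentStore;
        the metadata field and header value are only illustrative):**
        ```python
        |    docs = retriever.retrieve(
        |        query="Why did the revenue increase?",
        |        filters={"year": ["2019"]},                           # metadata field -> accepted values
        |        top_k=5,
        |        headers={"Authorization": "Basic YWRtaW46cm9vdA=="},  # illustrative credentials only
        |    )
        ```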
        """
        if top_k is None:
            top_k = self.top_k
        if index is None:
            index = self.document_store.index

        documents = self.document_store.query(query, filters, top_k, self.custom_query, index, headers=headers)
        return documents


class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever):
    """
    Naive "Retriever" that returns all documents matching the given filters; the query itself has no impact on the result.
    Helpful for benchmarking, testing, and for QA on small document collections without an "active" retriever.
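
    **A minimal usage sketch (assuming `document_store` is a populated ElasticsearchDocumentStore and
    its documents carry a "name" metadata field; the field name is only illustrative):**
    ```python
    |    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
    |    docs = retriever.retrieve(query="this text is ignored", filters={"name": ["2019_annual_report"]})
    ```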
    """

    def retrieve(
        self,
        query: str,
        filters: dict = None,
        top_k: Optional[int] = None,
        index: str = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> List[Document]:
        """
        Scan through documents in the DocumentStore and return a small number of documents
        that are most relevant to the query.

        :param query: The query
        :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
        :param top_k: How many documents to return per query.
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        :param headers: Custom HTTP headers to pass to the Elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
        """
        if top_k is None:
            top_k = self.top_k
        if index is None:
            index = self.document_store.index
        documents = self.document_store.query(
            query=None, filters=filters, top_k=top_k, custom_query=self.custom_query, index=index, headers=headers
        )
        return documents


# TODO make Paragraph generic for configurable units of text eg, pages, paragraphs, or split by a char_limit
Paragraph = namedtuple("Paragraph", ["paragraph_id", "document_id", "content", "meta"])


class TfidfRetriever(BaseRetriever):
    """
    Read all documents from the connected DocumentStore (e.g. a SQL backend).

    Split documents into smaller units (eg, paragraphs or pages) to reduce the
    computations when text is passed on to a Reader for QA.

    It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
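
    **A minimal usage sketch (assuming `document_store` is any populated DocumentStore instance,
    e.g. an in-memory or SQL-backed store with documents already written to it):**
    ```python
    |    retriever = TfidfRetriever(document_store=document_store, top_k=5)
    |    docs = retriever.retrieve(query="Why did the revenue increase?")
    ```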
    """

    def __init__(self, document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True):
        """
        :param document_store: an instance of a DocumentStore to retrieve documents from.
        :param top_k: How many documents to return per query.
        :param auto_fit: Whether to automatically update the tf-idf matrix by calling fit() after new documents have been added.
        """
        # save init parameters to enable export of component config as YAML
        self.set_config(document_store=document_store, top_k=top_k, auto_fit=auto_fit)

        self.vectorizer = TfidfVectorizer(
            lowercase=True, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1)
        )

        self.document_store = document_store
        self.paragraphs = self._get_all_paragraphs()
        self.df = None
        self.top_k = top_k
        self.auto_fit = auto_fit
        self.document_count = 0
        self.fit()

    def _get_all_paragraphs(self) -> List[Paragraph]:
        """
        Split the list of documents into paragraphs.
        """
        documents = self.document_store.get_all_documents()

        paragraphs = []
        p_id = 0
        for doc in documents:
            for p in doc.content.split(
                "\n\n"
            ):  # TODO: this assumes paragraphs are separated by "\n\n". Can be switched to paragraph tokenizer.
                if not p.strip():  # skip empty paragraphs
                    continue
                paragraphs.append(Paragraph(document_id=doc.id, paragraph_id=p_id, content=(p,), meta=doc.meta))
                p_id += 1
        logger.info(f"Found {len(paragraphs)} candidate paragraphs from {len(documents)} docs in DB")
        return paragraphs

    def _calc_scores(self, query: str) -> dict:
        # Embed the query in the fitted tf-idf vocabulary
        question_vector = self.vectorizer.transform([query])

        # Similarity of every paragraph to the query (dot product of the tf-idf vectors)
        scores = self.tfidf_matrix.dot(question_vector.T).toarray()
        idx_scores = [(idx, score) for idx, score in enumerate(scores)]
        # Map paragraph index -> score, sorted by descending score
        indices_and_scores = OrderedDict(sorted(idx_scores, key=(lambda tup: tup[1]), reverse=True))
        return indices_and_scores

    def retrieve(
        self,
        query: str,
        filters: dict = None,
        top_k: Optional[int] = None,
        index: str = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> List[Document]:
        """
        Scan through documents in the DocumentStore and return a small number of documents
        that are most relevant to the query.

        :param query: The query
        :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
        :param top_k: How many documents to return per query.
        :param index: The name of the index in the DocumentStore from which to retrieve documents
        """
        if self.auto_fit:
            if self.document_store.get_document_count(headers=headers) != self.document_count:
                # run fit() to update self.df, self.tfidf_matrix and self.document_count
                logger.warning(
                    "Indexed documents have been updated and fit() method needs to be run before retrieval. Running it now."
                )
                self.fit()
        if self.df is None:
            raise Exception(
                "Retrieval requires dataframe df and tf-idf matrix but fit() did not calculate them probably due to an empty document store."
            )

        if filters:
            raise NotImplementedError("Filters are not implemented in TfidfRetriever.")
        if index:
            raise NotImplementedError("Switching index is not supported in TfidfRetriever.")

        if top_k is None:
            top_k = self.top_k
        # get scores
        indices_and_scores = self._calc_scores(query)

        # rank paragraphs
        df_sliced = self.df.loc[indices_and_scores.keys()]
        df_sliced = df_sliced[:top_k]

        logger.debug(
            f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
        )

        # get actual content for the top candidates
        paragraphs = list(df_sliced.content.values)
        meta_data = [
            {"document_id": row["document_id"], "paragraph_id": row["paragraph_id"], "meta": row.get("meta", {})}
            for idx, row in df_sliced.iterrows()
        ]

        documents = []
        for para, meta in zip(paragraphs, meta_data):
            documents.append(Document(id=meta["document_id"], content=para, meta=meta.get("meta", {})))

        return documents

    def fit(self):
        """
        Perform training on this class according to the TF-IDF algorithm.
        """
        if not self.paragraphs or len(self.paragraphs) == 0:
            self.paragraphs = self._get_all_paragraphs()
            if not self.paragraphs or len(self.paragraphs) == 0:
                logger.warning("Fit method called with empty document store")
                return

        self.df = pd.DataFrame.from_dict(self.paragraphs)
        self.df["content"] = self.df["content"].apply(lambda x: " ".join(x))  # pylint: disable=unnecessary-lambda
        self.tfidf_matrix = self.vectorizer.fit_transform(self.df["content"])
        self.document_count = self.document_store.get_document_count()