from abc import ABC, abstractmethod
from collections import OrderedDict, namedtuple
import logging

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from farm_haystack.database import db
from farm_haystack.database.orm import Document

logger = logging.getLogger(__name__)


# TODO make Paragraph generic for configurable units of text eg, pages, paragraphs, or split by a char_limit
Paragraph = namedtuple("Paragraph", ["paragraph_id", "document_id", "text"])
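# Field semantics (as used in the code below): paragraph_id is a running index
# across all documents, document_id points back to the source Document row, and
# text is stored as a one-element tuple that fit() later joins into a string.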


class BaseRetriever(ABC):
    @abstractmethod
    def _get_all_paragraphs(self):
        pass

    @abstractmethod
    def retrieve(self, query, candidate_doc_ids=None, top_k=1):
        pass

    @abstractmethod
    def fit(self):
        pass
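

# A concrete retriever only needs to implement the three abstract methods above.
# Minimal sketch (hypothetical, for illustration only):
#
#   class KeywordRetriever(BaseRetriever):
#       def _get_all_paragraphs(self):
#           ...  # load documents and split them into Paragraph tuples
#
#       def retrieve(self, query, candidate_doc_ids=None, top_k=1):
#           ...  # score paragraphs against the query, return the best top_k
#
#       def fit(self):
#           ...  # build whatever index retrieve() needs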


class TfidfRetriever(BaseRetriever):
    """
    Read all documents from a SQL backend.

    Split documents into smaller units (e.g., paragraphs or pages) to reduce
    computation when text is passed on to a Reader for QA.

    It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
    """

    def __init__(self):
        # These arguments spell out sklearn's defaults: lowercase the text,
        # keep word tokens of two or more characters, unigrams only
        self.vectorizer = TfidfVectorizer(
            lowercase=True,
            stop_words=None,
            token_pattern=r"(?u)\b\w\w+\b",
            ngram_range=(1, 1),
        )

        self.paragraphs = self._get_all_paragraphs()
        self.df = None
        self.fit()

    def _get_all_paragraphs(self):
        """
        Split the list of documents into paragraphs
        """
        documents = db.session.query(Document).all()

        paragraphs = []
        p_id = 0
        for doc in documents:
            # paragraphs are assumed to be separated by a blank line
            for p in doc.text.split("\n\n"):
                if not p.strip():  # skip empty paragraphs
                    continue
                paragraphs.append(
                    Paragraph(document_id=doc.id, paragraph_id=p_id, text=(p,))
                )
                p_id += 1
        logger.info(f"Found {len(paragraphs)} candidate paragraphs from {len(documents)} docs in DB")
        return paragraphs
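
    # Illustrative example (hypothetical data): a Document whose text is
    # "First paragraph.\n\nSecond paragraph." yields
    #   Paragraph(paragraph_id=0, document_id=<doc.id>, text=("First paragraph.",))
    #   Paragraph(paragraph_id=1, document_id=<doc.id>, text=("Second paragraph.",))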

    def retrieve(self, query, candidate_doc_ids=None, top_k=10):
        # TODO: candidate_doc_ids is not used yet; all paragraphs are scored
        question_vector = self.vectorizer.transform([query])

        scores = self.tfidf_matrix.dot(question_vector.T).toarray()
        idx_scores = [(idx, score) for idx, score in enumerate(scores)]
        top_k_scores = OrderedDict(
            sorted(idx_scores, key=(lambda tup: tup[1]), reverse=True)[:top_k]
        )
        return top_k_scores
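
    # Scoring note: TfidfVectorizer l2-normalizes rows by default (norm="l2"),
    # so the dot product between a paragraph row of the tf-idf matrix and the
    # query vector is their cosine similarity.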

    def fit(self):
        self.df = pd.DataFrame.from_dict(self.paragraphs)
        # each row's text is a one-element tuple; join it into a single string
        self.df["text"] = self.df["text"].apply(lambda x: " ".join(x))
        self.tfidf_matrix = self.vectorizer.fit_transform(self.df["text"])