Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-09-25 08:04:49 +00:00
refactor: replace mutable default arguments (#4070)
* refactor: replace mutable default arguments
* change type annotation in BasePreProcessor to Optional[List]
This commit is contained in:
parent 3273a2714d
commit 0e282e5ca4
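The pattern applied throughout this commit: a mutable object (list or dict) used as a default argument is created once at function definition time and shared across all calls, so state can leak between calls. The fix is the None-sentinel idiom shown in every hunk below. A minimal sketch of the pitfall and of the fix (the function names are illustrative, not taken from the Haystack codebase):

    from typing import Dict, Optional

    def add_param_buggy(key: str, value: float, params: dict = {}) -> dict:
        # The same dict object is reused by every call that relies on the default.
        params[key] = value
        return params

    def add_param_fixed(key: str, value: float, params: Optional[Dict] = None) -> dict:
        # A fresh dict is created per call when no argument is passed.
        if params is None:
            params = {}
        params[key] = value
        return params

    print(add_param_buggy("k1", 1.5))   # {'k1': 1.5}
    print(add_param_buggy("b", 0.75))   # {'k1': 1.5, 'b': 0.75} <- state leaked from the first call
    print(add_param_fixed("k1", 1.5))   # {'k1': 1.5}
    print(add_param_fixed("b", 0.75))   # {'b': 0.75} <- independent calls stay independent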
@@ -49,7 +49,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
use_bm25: bool = False,
bm25_tokenization_regex: str = r"(?u)\b\w\w+\b",
bm25_algorithm: Literal["BM25Okapi", "BM25L", "BM25Plus"] = "BM25Okapi",
- bm25_parameters: dict = {},
+ bm25_parameters: Optional[Dict] = None,
):
"""
:param index: The documents are scoped to an index attribute that can be used when writing, querying,
@@ -87,7 +87,10 @@ class InMemoryDocumentStore(KeywordDocumentStore):
:param bm25_parameters: Parameters for BM25 implementation in a dictionary format.
For example: {'k1':1.5, 'b':0.75, 'epsilon':0.25}
You can learn more about these parameters by visiting https://github.com/dorianbrown/rank_bm25
+ By default, no parameters are set.
"""
+ if bm25_parameters is None:
+ bm25_parameters = {}
super().__init__()

self.indexes: Dict[str, Dict] = defaultdict(dict)
@@ -68,7 +68,7 @@ class PineconeDocumentStore(BaseDocumentStore):
progress_bar: bool = True,
duplicate_documents: str = "overwrite",
recreate_index: bool = False,
- metadata_config: dict = {"indexed": []},
+ metadata_config: Optional[Dict] = None,
validate_index_sync: bool = True,
):
"""
@@ -106,6 +106,8 @@ class PineconeDocumentStore(BaseDocumentStore):
Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. By default,
no fields are indexed.
"""
+ if metadata_config is None:
+ metadata_config = {"indexed": []}
# Connect to Pinecone server using python client binding
if not api_key:
raise PineconeDocumentStoreError(
@@ -201,12 +203,14 @@ class PineconeDocumentStore(BaseDocumentStore):
replicas: Optional[int] = 1,
shards: Optional[int] = 1,
recreate_index: bool = False,
- metadata_config: dict = {"indexed": []},
+ metadata_config: Optional[Dict] = None,
):
"""
Create a new index for storing documents in case an
index with the name doesn't exist already.
"""
+ if metadata_config is None:
+ metadata_config = {"indexed": []}
index = self._index_name(index)

if recreate_index:
@@ -555,7 +555,7 @@ class DataSiloForCrossVal:
def make(
cls,
datasilo: DataSilo,
- sets: List[str] = ["train", "dev", "test"],
+ sets: Optional[List[str]] = None,
n_splits: int = 5,
shuffle: bool = True,
random_state: Optional[int] = None,
@@ -568,7 +568,7 @@ class DataSiloForCrossVal:
original data silo passed on.

:param datasilo: The data silo that contains the original data.
- :param sets: Which sets to use to create the xval folds (strings)
+ :param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used.
:param n_splits: number of folds to create
:param shuffle: shuffle each class' samples before splitting
:param random_state: random state for shuffling
@@ -576,6 +576,8 @@ class DataSiloForCrossVal:
It is never done with question answering.
:param n_neg_answers_per_question: number of negative answers per question to include for training
"""
+ if sets is None:
+ sets = ["train", "dev", "test"]
if "question_answering" in datasilo.processor.tasks and n_inner_splits is None: # type: ignore
return cls._make_question_answering(
datasilo, sets, n_splits, shuffle, random_state, n_neg_answers_per_question
@@ -588,7 +590,7 @@ class DataSiloForCrossVal:
def _make_question_answering(
cls,
datasilo: DataSilo,
- sets: List[str] = ["train", "dev", "test"],
+ sets: Optional[List[str]] = None,
n_splits: int = 5,
shuffle: bool = True,
random_state: Optional[int] = None,
@@ -600,12 +602,14 @@ class DataSiloForCrossVal:
data for question-answering-

:param datasilo: The data silo that contains the original data.
- :param sets: Which sets to use to create the xval folds (strings).
+ :param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used.
:param n_splits: Number of folds to create.
:param shuffle: Shuffle each class' samples before splitting.
:param random_state: Random state for shuffling.
:param n_neg_answers_per_question: Number of negative answers per question to include for training.
"""
+ if sets is None:
+ sets = ["train", "dev", "test"]
assert "id" in datasilo.tensor_names, f"Expected tensor 'id' in tensor names, found {datasilo.tensor_names}" # type: ignore
assert "labels" in datasilo.tensor_names, f"Expected tensor 'labels' in tensor names, found {datasilo.tensor_names}" # type: ignore
@@ -59,7 +59,7 @@ class Processor(ABC):
test_filename: Optional[Union[Path, str]],
dev_split: float,
data_dir: Optional[Union[Path, str]],
- tasks: Dict = {},
+ tasks: Optional[Dict] = None,
proxies: Optional[Dict] = None,
multithreading_rust: Optional[bool] = True,
):
@@ -82,6 +82,8 @@ class Processor(ABC):
Note: Enabling multithreading in Rust AND multiprocessing in python might cause
deadlocks.
"""
+ if tasks is None:
+ tasks = {}
if not multithreading_rust:
os.environ["RAYON_RS_NUM_CPUS"] = "1"

@@ -313,7 +315,7 @@ class Processor(ABC):

@abstractmethod
def dataset_from_dicts(
- self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+ self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
raise NotImplementedError()

@@ -444,7 +446,7 @@ class SquadProcessor(Processor):
)

def dataset_from_dicts(
- self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+ self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for Question Answering.
@@ -456,6 +458,8 @@ class SquadProcessor(Processor):
:param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
"""
+ if indices is None:
+ indices = []
# Convert to standard format
pre_baskets = [self.convert_qa_input_dict(x) for x in dicts] # TODO move to input object conversion

@@ -990,7 +994,7 @@ class TextSimilarityProcessor(Processor):
json.dump(config, file)

def dataset_from_dicts(
- self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+ self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
@@ -1013,6 +1017,8 @@ class TextSimilarityProcessor(Processor):
:param return_baskets: whether to return the baskets or not (baskets are needed during inference)
:return: dataset, tensor_names, problematic_ids, [baskets]
"""
+ if indices is None:
+ indices = []
# Take the dict and insert into our basket structure, this stages also adds an internal IDs
baskets = self._fill_baskets(dicts, indices)

@@ -1254,7 +1260,7 @@ class TableTextSimilarityProcessor(Processor):
dev_split: float = 0.1,
proxies: Optional[Dict] = None,
max_samples: Optional[int] = None,
- embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
+ embed_meta_fields: Optional[List[str]] = None,
num_positives: int = 1,
num_hard_negatives: int = 1,
shuffle_negatives: bool = True,
@@ -1284,7 +1290,7 @@ class TableTextSimilarityProcessor(Processor):
:param proxies: Proxy configuration to allow downloads of remote datasets.
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:param max_samples: maximum number of samples to use.
- :param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization.
+ :param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization. By default, "page_title", "section_title", and "caption" are used.
:param num_hard_negatives: Maximum number of hard negative context passages in a sample.
:param num_positives: Maximum number of positive context passages in a sample.
:param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the
@@ -1296,6 +1302,8 @@ class TableTextSimilarityProcessor(Processor):
"""
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs

+ if embed_meta_fields is None:
+ embed_meta_fields = ["page_title", "section_title", "caption"]
# Custom processor attributes
self.max_samples = max_samples
self.query_tokenizer = query_tokenizer
@@ -1511,7 +1519,7 @@ class TableTextSimilarityProcessor(Processor):
return standard_dicts

def dataset_from_dicts(
- self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+ self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity.
@@ -1533,7 +1541,8 @@ class TableTextSimilarityProcessor(Processor):
:param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
"""

+ if indices is None:
+ indices = []
# Take the dict and insert into our basket structure, this stages also adds an internal IDs
baskets = self._fill_baskets(dicts, indices)

@@ -1861,8 +1870,10 @@ class TextClassificationProcessor(Processor):
raise NotImplementedError

def dataset_from_dicts(
- self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+ self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
+ if indices is None:
+ indices = []
baskets = []
# Tokenize in batches
texts = [x["text"] for x in dicts]
@@ -2043,10 +2054,12 @@ class UnlabeledTextProcessor(Processor):
test_filename: Optional[Union[Path, str]] = None,
dev_split: float = 0,
data_dir: Optional[Union[Path, str]] = None,
- tasks: Dict = {},
+ tasks: Optional[Dict] = None,
proxies: Optional[Dict] = None,
multithreading_rust: Optional[bool] = True,
):
+ if tasks is None:
+ tasks = {}
super().__init__(
tokenizer,
max_seq_len,
@@ -2069,8 +2082,10 @@ class UnlabeledTextProcessor(Processor):
return dicts

def dataset_from_dicts(
- self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+ self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
+ if indices is None:
+ indices = []
if return_baskets:
raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor")
texts = [dict_["text"] for dict_ in dicts]
@@ -42,8 +42,8 @@ class BiAdaptiveModel(nn.Module):
prediction_heads: List[PredictionHead],
embeds_dropout_prob: float = 0.1,
device: torch.device = torch.device("cuda"),
- lm1_output_types: Union[str, List[str]] = ["per_sequence"],
- lm2_output_types: Union[str, List[str]] = ["per_sequence"],
+ lm1_output_types: Optional[Union[str, List[str]]] = None,
+ lm2_output_types: Optional[Union[str, List[str]]] = None,
loss_aggregation_fn: Optional[Callable] = None,
):
"""
@@ -54,12 +54,12 @@ class BiAdaptiveModel(nn.Module):
language models will be zeroed.
:param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set
to "per_token", one embedding will be extracted per input token. If set to
- "per_sequence", a single embedding will be extracted to represent the full
+ "per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set
to "per_token", one embedding will be extracted per input token. If set to
- "per_sequence", a single embedding will be extracted to represent the full
+ "per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda").
@@ -74,6 +74,10 @@ class BiAdaptiveModel(nn.Module):
Note: The loss at this stage is per sample, i.e one tensor of
shape (batchsize) per prediction head.
"""
+ if lm1_output_types is None:
+ lm1_output_types = ["per_sequence"]
+ if lm2_output_types is None:
+ lm2_output_types = ["per_sequence"]
super(BiAdaptiveModel, self).__init__()

self.device = device
@@ -231,7 +231,7 @@ class QuestionAnsweringHead(PredictionHead):

def __init__(
self,
- layer_dims: List[int] = [768, 2],
+ layer_dims: Optional[List[int]] = None,
task_name: str = "question_answering",
no_ans_boost: float = 0.0,
context_window_size: int = 100,
@@ -244,7 +244,7 @@ class QuestionAnsweringHead(PredictionHead):
**kwargs,
):
"""
- :param layer_dims: dimensions of Feed Forward block, e.g. [768,2], for adjusting to BERT embedding. Output should be always 2
+ :param layer_dims: dimensions of Feed Forward block, e.g. [768,2] used by default, for adjusting to BERT embedding. Output should be always 2
:param kwargs: placeholder for passing generic parameters
:param no_ans_boost: How much the no_answer logit is boosted/increased.
The higher the value, the more likely a "no answer possible given the input text" is returned by the model
@@ -260,6 +260,8 @@ class QuestionAnsweringHead(PredictionHead):
:param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
"""
+ if layer_dims is None:
+ layer_dims = [768, 2]
super(QuestionAnsweringHead, self).__init__()
if len(kwargs) > 0:
logger.warning(
@@ -248,7 +248,7 @@ class QAPred(Pred):
aggregation_level: str,
no_answer_gap: float,
ground_truth_answer: Optional[str] = None,
- answer_types: List[str] = [],
+ answer_types: Optional[List[str]] = None,
):
"""
:param id: The id of the passage or document
@@ -262,6 +262,8 @@ class QAPred(Pred):
:param ground_truth_answer: Ground truth answers
:param answer_types: List of answer_types supported by this task e.g. ["span", "yes_no", "no_answer"]
"""
+ if answer_types is None:
+ answer_types = []
super().__init__(id, prediction, context)
self.question = question
self.token_offsets = token_offsets
@@ -44,9 +44,9 @@ class TriAdaptiveModel(nn.Module):
prediction_heads: List[PredictionHead],
embeds_dropout_prob: float = 0.1,
device: torch.device = torch.device("cuda"),
- lm1_output_types: Union[str, List[str]] = ["per_sequence"],
- lm2_output_types: Union[str, List[str]] = ["per_sequence"],
- lm3_output_types: Union[str, List[str]] = ["per_sequence"],
+ lm1_output_types: Optional[Union[str, List[str]]] = None,
+ lm2_output_types: Optional[Union[str, List[str]]] = None,
+ lm3_output_types: Optional[Union[str, List[str]]] = None,
loss_aggregation_fn: Optional[Callable] = None,
):
"""
@@ -58,17 +58,17 @@ class TriAdaptiveModel(nn.Module):
language model will be zeroed.
:param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set
to "per_token", one embedding will be extracted per input token. If set to
- "per_sequence", a single embedding will be extracted to represent the full
+ "per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set
to "per_token", one embedding will be extracted per input token. If set to
- "per_sequence", a single embedding will be extracted to represent the full
+ "per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param lm3_output_types: How to extract the embeddings from the final layer of the third language model. When set
to "per_token", one embedding will be extracted per input token. If set to
- "per_sequence", a single embedding will be extracted to represent the full
+ "per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda").
@@ -83,7 +83,12 @@ class TriAdaptiveModel(nn.Module):
Note: The loss at this stage is per sample, i.e one tensor of
shape (batchsize) per prediction head.
"""

+ if lm1_output_types is None:
+ lm1_output_types = ["per_sequence"]
+ if lm2_output_types is None:
+ lm2_output_types = ["per_sequence"]
+ if lm3_output_types is None:
+ lm3_output_types = ["per_sequence"]
super(TriAdaptiveModel, self).__init__()
self.device = device
self.language_model1 = language_model1.to(device)
@@ -261,10 +261,12 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
return component_schema, {"$ref": f"#/definitions/{component_name}"}


- def get_json_schema(filename: str, version: str, modules: List[str] = ["haystack.document_stores", "haystack.nodes"]):
+ def get_json_schema(filename: str, version: str, modules: Optional[List[str]] = None):
"""
Generate JSON schema for Haystack pipelines.
"""
+ if modules is None:
+ modules = ["haystack.document_stores", "haystack.nodes"]
schema_definitions = {}  # All the schemas for the node and accessory classes
node_refs = []  # References to the nodes only (accessory classes cannot be listed among the nodes in a config)
@@ -1,5 +1,5 @@
import mimetypes
- from typing import Any, Dict, List, Union
+ from typing import Any, Dict, List, Union, Optional

import logging
from pathlib import Path
@@ -29,14 +29,16 @@ class FileTypeClassifier(BaseComponent):

outgoing_edges = len(DEFAULT_TYPES)

- def __init__(self, supported_types: List[str] = DEFAULT_TYPES):
+ def __init__(self, supported_types: Optional[List[str]] = None):
"""
Node that sends out files on a different output edge depending on their extension.

:param supported_types: The file types that this node can distinguish between.
- The default values are: `txt`, `pdf`, `md`, `docx`, and `html`.
+ If no value is provided, the value created by default comprises: `txt`, `pdf`, `md`, `docx`, and `html`.
Lists with duplicate elements are not allowed.
"""
+ if supported_types is None:
+ supported_types = DEFAULT_TYPES
if len(set(supported_types)) != len(supported_types):
duplicates = supported_types
for item in set(supported_types):
@@ -137,7 +137,7 @@ class BaseConverter(BaseComponent):
file_paths: Union[Path, List[Path]],
meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
remove_numeric_tables: Optional[bool] = None,
- known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+ known_ligatures: Optional[Dict[str, str]] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None,
@@ -153,12 +153,13 @@ class BaseConverter(BaseComponent):
does not have table parsing capability for finding answers. However, tables
may also have long strings that could possible candidate for searching answers.
The rows containing strings are thus retained in this option.
- :param known_ligatures: Some converters tends to recognize clusters of letters as ligatures, such as "ff" (double f).
+ :param known_ligatures: Some converters tend to recognize clusters of letters as ligatures, such as "ff" (double f).
Such ligatures however make text hard to compare with the content of other files,
which are generally ligature free. Therefore we automatically find and replace the most
common ligatures with their split counterparts. The default mapping is in
`haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabeths
but excludes all ligatures that are known to be used in IPA.
+ If no value is provided, this default is created and used.
You can use this parameter to provide your own set of ligatures to clean up from the documents.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
@@ -171,6 +172,8 @@ class BaseConverter(BaseComponent):
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""
+ if known_ligatures is None:
+ known_ligatures = KNOWN_LIGATURES

if isinstance(file_paths, Path):
file_paths = [file_paths]
@@ -206,7 +209,7 @@ class BaseConverter(BaseComponent):
file_paths: Union[Path, List[Path]],
meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
remove_numeric_tables: Optional[bool] = None,
- known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+ known_ligatures: Optional[Dict[str, str]] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None,
@@ -24,7 +24,7 @@ class ImageToTextConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
- valid_languages: Optional[List[str]] = ["eng"],
+ valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
):
"""
@@ -37,7 +37,8 @@ class ImageToTextConverter(BaseConverter):
(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
This option can be used to add test for encoding errors. If the extracted text is
not one of the valid languages, then it might likely be encoding error resulting
- in garbled text. Run the following line of code to check available language packs:
+ in garbled text. If no value is provided, English will be set as default.
+ Run the following line of code to check available language packs:
# List of available languages
print(pytesseract.get_languages(config=''))
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
@@ -45,6 +46,8 @@ class ImageToTextConverter(BaseConverter):
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""
+ if valid_languages is None:
+ valid_languages = ["eng"]
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
)
@@ -208,7 +208,7 @@ class PDFToTextOCRConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
- valid_languages: Optional[List[str]] = ["eng"],
+ valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
):
"""
@@ -223,12 +223,14 @@ class PDFToTextOCRConverter(BaseConverter):
(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
This option can be used to add test for encoding errors. If the extracted text is
not one of the valid languages, then it might likely be encoding error resulting
- in garbled text.
+ in garbled text. If no value is provided, English will be set as default.
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""
+ if valid_languages is None:
+ valid_languages = ["eng"]
# init image to text instance
self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)
@@ -21,7 +21,7 @@ class BasePreProcessor(BaseComponent):
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
- remove_substrings: List[str] = [],
+ remove_substrings: Optional[List[str]] = None,
split_by: Literal["word", "sentence", "passage", None] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
@@ -41,7 +41,7 @@ class BasePreProcessor(BaseComponent):
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
- remove_substrings: List[str],
+ remove_substrings: Optional[List[str]],
) -> Document:
raise NotImplementedError
@@ -54,7 +54,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: bool = True,
clean_header_footer: bool = False,
clean_empty_lines: bool = True,
- remove_substrings: List[str] = [],
+ remove_substrings: Optional[List[str]] = None,
split_by: Optional[Literal["word", "sentence", "passage"]] = "word",
split_length: int = 200,
split_overlap: int = 0,
@@ -73,7 +73,7 @@ class PreProcessor(BasePreProcessor):
or similar.
:param clean_whitespace: Strip whitespaces before or after each line in the text.
:param clean_empty_lines: Remove more than two empty lines in the text.
- :param remove_substrings: Remove specified substrings from the text.
+ :param remove_substrings: Remove specified substrings from the text. If no value is provided an empty list is created by default.
:param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
:param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
"sentence", then each output document will have 10 sentences.
@@ -100,6 +100,8 @@ class PreProcessor(BasePreProcessor):
`AzureConverter`.
:param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning.
"""
+ if remove_substrings is None:
+ remove_substrings = []
super().__init__()

try:
@@ -132,7 +134,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
- remove_substrings: List[str] = [],
+ remove_substrings: Optional[List[str]] = None,
split_by: Optional[Literal["word", "sentence", "passage"]] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
@@ -143,6 +145,8 @@ class PreProcessor(BasePreProcessor):
"""
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
"""
+ if remove_substrings is None:
+ remove_substrings = []
if not isinstance(documents, list):
warnings.warn(
"Using a single Document as argument to the 'documents' parameter is deprecated. Use a list "
@@ -197,14 +201,15 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
- remove_substrings: List[str] = [],
+ remove_substrings: Optional[List[str]] = None,
split_by: Optional[Literal["word", "sentence", "passage"]] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:

+ if remove_substrings is None:
+ remove_substrings = []
if clean_whitespace is None:
clean_whitespace = self.clean_whitespace
if clean_header_footer is None:
@@ -258,13 +263,15 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
- remove_substrings: List[str],
+ remove_substrings: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
) -> Document:
"""
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
"""
+ if remove_substrings is None:
+ remove_substrings = []
if id_hash_keys is None:
id_hash_keys = self.id_hash_keys
@@ -67,7 +67,7 @@ class TransformersQueryClassifier(BaseQueryClassifier):
tokenizer: Optional[str] = None,
use_gpu: bool = True,
task: str = "text-classification",
- labels: List[str] = DEFAULT_LABELS,
+ labels: Optional[List[str]] = None,
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
@@ -96,6 +96,8 @@ class TransformersQueryClassifier(BaseQueryClassifier):
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
parameter is not used and a single cpu device is used for inference.
"""
+ if labels is None:
+ labels = DEFAULT_LABELS
super().__init__()
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(resolved_devices) > 1:
@@ -176,7 +176,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
- devices: List[torch.device] = [],
+ devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
n_epochs: int = 2,
learning_rate: float = 1e-5,
@@ -205,6 +205,8 @@ class FARMReader(BaseReader):
doc_stride: Optional[int] = None,
max_query_length: Optional[int] = None,
):
+ if devices is None:
+ devices = []
if dev_filename:
dev_split = 0

@@ -363,7 +365,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
- devices: List[torch.device] = [],
+ devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
n_epochs: int = 2,
learning_rate: float = 1e-5,
@@ -469,7 +471,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
- devices: List[torch.device] = [],
+ devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
teacher_batch_size: Optional[int] = None,
n_epochs: int = 2,
@@ -595,7 +597,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
- devices: List[torch.device] = [],
+ devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
teacher_batch_size: Optional[int] = None,
n_epochs: int = 5,
@@ -794,7 +794,7 @@ class TableTextRetriever(DenseRetriever):
top_k: int = 10,
use_gpu: bool = True,
batch_size: int = 16,
- embed_meta_fields: List[str] = ["name", "section_title", "caption"],
+ embed_meta_fields: Optional[List[str]] = None,
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
global_loss_buffer_size: int = 150000,
@@ -825,7 +825,8 @@ class TableTextRetriever(DenseRetriever):
then used to create the embedding.
This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
- (topic, entities etc.).
+ (topic, entities etc.). If no value is provided, a default will be created.
+ That default embeds name, section title and caption.
:param use_fast_tokenizers: Whether to use fast Rust tokenizers
:param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training.
Options: `dot_product` (Default) or `cosine`
@@ -849,6 +850,8 @@ class TableTextRetriever(DenseRetriever):
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
:param use_fast: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True.
"""
+ if embed_meta_fields is None:
+ embed_meta_fields = ["name", "section_title", "caption"]
super().__init__()

self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
@@ -1225,7 +1228,7 @@ class TableTextRetriever(DenseRetriever):
max_processes: int = 128,
dev_split: float = 0,
batch_size: int = 2,
- embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
+ embed_meta_fields: Optional[List[str]] = None,
num_hard_negatives: int = 1,
num_positives: int = 1,
n_epochs: int = 3,
@@ -1260,7 +1263,7 @@ class TableTextRetriever(DenseRetriever):
:param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None.
:param batch_size: Total number of samples in 1 batch of data.
:param embed_meta_fields: Concatenate meta fields with each passage and table.
- The default setting in official MMRetrieval embeds page title,
+ If no value is provided, a default will be created. That default embeds page title,
section title and caption with the corresponding table and title with
corresponding text passage.
:param num_hard_negatives: Number of hard negative passages (passages which are
@@ -1290,6 +1293,8 @@ class TableTextRetriever(DenseRetriever):
:param checkpoints_to_keep: The maximum number of train checkpoints to save.
:param early_stopping: An initialized EarlyStopping object to control early stopping and saving of the best models.
"""
+ if embed_meta_fields is None:
+ embed_meta_fields = ["page_title", "section_title", "caption"]

self.processor.embed_meta_fields = embed_meta_fields
self.processor.data_dir = Path(data_dir)
@@ -1393,7 +1398,7 @@ class TableTextRetriever(DenseRetriever):
max_seq_len_table: int = 256,
use_gpu: bool = True,
batch_size: int = 16,
- embed_meta_fields: List[str] = ["name", "section_title", "caption"],
+ embed_meta_fields: Optional[List[str]] = None,
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
query_encoder_dir: str = "query_encoder",
@@ -1403,6 +1408,8 @@ class TableTextRetriever(DenseRetriever):
"""
Load TableTextRetriever from the specified directory.
"""
+ if embed_meta_fields is None:
+ embed_meta_fields = ["name", "section_title", "caption"]

load_dir = Path(load_dir)
mm_retriever = cls(
@@ -1441,7 +1448,7 @@ class EmbeddingRetriever(DenseRetriever):
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
- embed_meta_fields: List[str] = [],
+ embed_meta_fields: Optional[List[str]] = None,
api_key: Optional[str] = None,
):
"""
@@ -1494,10 +1501,13 @@ class EmbeddingRetriever(DenseRetriever):
This approach is also used in the TableTextRetriever paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
+ If no value is provided, a default empty list will be created.
:param api_key: The OpenAI API key or the Cohere API key. Required if one wants to use OpenAI/Cohere embeddings.
For more details see https://beta.openai.com/account/api-keys and https://dashboard.cohere.ai/api-keys

"""
+ if embed_meta_fields is None:
+ embed_meta_fields = []
super().__init__()

self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
@@ -1929,7 +1939,7 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever):
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
- embed_meta_fields: List[str] = [],
+ embed_meta_fields: Optional[List[str]] = None,
):
"""
:param document_store: An instance of DocumentStore from which to retrieve documents.
@@ -1977,7 +1987,10 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever):
This approach is also used in the TableTextRetriever paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
+ If no value is provided, a default empty list will be created.
"""
+ if embed_meta_fields is None:
+ embed_meta_fields = []
super().__init__(
embedding_model=embedding_model,
document_store=document_store,
@@ -44,7 +44,7 @@ class MultiModalEmbedder:
embedding_models: Dict[str, Union[Path, str]],  # replace str with ContentTypes starting from Python3.8
feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None,
batch_size: int = 16,
- embed_meta_fields: List[str] = ["name"],
+ embed_meta_fields: Optional[List[str]] = None,
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
@@ -67,6 +67,7 @@ class MultiModalEmbedder:
This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
+ If no value is provided, a default with "name" as embedding field is created.
:param progress_bar: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
:param devices: List of GPU (or CPU) devices to limit inference to certain GPUs and not use all available ones.
@@ -78,6 +79,8 @@ class MultiModalEmbedder:
the local token is used, which must be previously created using `transformer-cli login`.
For more information, see [Hugging Face documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained)
"""
+ if embed_meta_fields is None:
+ embed_meta_fields = ["name"]
super().__init__()

self.devices = get_devices(devices)
@@ -22,11 +22,11 @@ class MultiModalRetriever(DenseRetriever):
query_embedding_model: Union[Path, str],
document_embedding_models: Dict[str, Union[Path, str]],  # Replace str with ContentTypes starting Python3.8
query_type: str = "text",  # Replace str with ContentTypes starting Python3.8
- query_feature_extractor_params: Dict[str, Any] = {"max_length": 64},
- document_feature_extractors_params: Dict[str, Dict[str, Any]] = {"text": {"max_length": 256}},
+ query_feature_extractor_params: Optional[Dict[str, Any]] = None,
+ document_feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None,
top_k: int = 10,
batch_size: int = 16,
- embed_meta_fields: List[str] = ["name"],
+ embed_meta_fields: Optional[List[str]] = None,
similarity_function: str = "dot_product",
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
@@ -46,14 +46,14 @@ class MultiModalRetriever(DenseRetriever):
checkpoint with the content type it should handle ("text", "table", "image", and so on).
The format equals the one used by Hugging Face transformers' modelhub models.
:param query_type: The content type of the query ("text", "image" and so on).
- :param query_feature_extraction_params: The parameters to pass to the feature extractor of the query.
- :param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents.
+ :param query_feature_extraction_params: The parameters to pass to the feature extractor of the query. If no value is provided, a default dictionary with "max_length": 64 will be set.
+ :param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents. If no value is provided, a default dictionary with "text": {"max_length": 256} will be set.
:param top_k: How many documents to return per query.
:param batch_size: Number of questions or documents to encode at once. For multiple GPUs, this is
the total batch size.
:param embed_meta_fields: Concatenate the provided meta fields to a (text) pair that is then used to create
the embedding. This is likely to improve performance if your titles contain meaningful information
- for retrieval (topic, entities, and so on). Note that only text and table documents support this feature.
+ for retrieval (topic, entities, and so on). Note that only text and table documents support this feature. If no values is provided, a default with "name" as embedding field will be created.
:param similarity_function: Which function to apply for calculating the similarity of query and document
embeddings during training. Options: `dot_product` (default) or `cosine`.
:param progress_bar: Whether to show a tqdm progress bar or not.
@@ -72,6 +72,12 @@ class MultiModalRetriever(DenseRetriever):
range are scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise raw similarity scores (for example, cosine or dot_product) are used.
"""
+ if query_feature_extractor_params is None:
+ query_feature_extractor_params = {"max_length": 64}
+ if document_feature_extractors_params is None:
+ document_feature_extractors_params = {"text": {"max_length": 256}}
+ if embed_meta_fields is None:
+ embed_meta_fields = ["name"]
super().__init__()

self.similarity_function = similarity_function
@@ -745,12 +745,12 @@ class Pipeline:
cls,
index_pipeline: Pipeline,
query_pipeline: Pipeline,
- index_params: dict = {},
- query_params: dict = {},
+ index_params: Optional[Dict] = None,
+ query_params: Optional[Dict] = None,
dataset: str = "scifact",
dataset_dir: Path = Path("."),
num_documents: Optional[int] = None,
- top_k_values: List[int] = [1, 3, 5, 10, 100, 1000],
+ top_k_values: Optional[List[int]] = None,
keep_index: bool = False,
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]:
"""
@@ -765,7 +765,7 @@ class Pipeline:
:param dataset_dir: The directory to store the dataset to.
:param num_documents: Maximum number of documents to load from given dataset. If set to None (default)
or to a value larger than the number of documents in the dataset, the full dataset is loaded.
- :param top_k_values: The top_k values each metric will be calculated for.
+ :param top_k_values: The top_k values each metric will be calculated for. By default, the values are 1, 3, 5, 10, 100, and 1000.
:param keep_index: Whether to keep the index after evaluation.
If True the index will be kept after beir evaluation. Otherwise it will be deleted immediately afterwards.
Defaults to False.
@@ -773,6 +773,12 @@ class Pipeline:
Returns a tuple containing the ncdg, map, recall and precision scores.
Each metric is represented by a dictionary containing the scores for each top_k value.
"""
+ if index_params is None:
+ index_params = {}
+ if query_params is None:
+ query_params = {}
+ if top_k_values is None:
+ top_k_values = [1, 3, 5, 10, 100, 1000]
try:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
@@ -855,11 +861,11 @@ class Pipeline:
experiment_tracking_tool: Literal["mlflow", None] = None,
experiment_tracking_uri: Optional[str] = None,
corpus_file_metas: Optional[List[Dict[str, Any]]] = None,
- corpus_meta: Dict[str, Any] = {},
- evaluation_set_meta: Dict[str, Any] = {},
- pipeline_meta: Dict[str, Any] = {},
- index_params: dict = {},
- query_params: dict = {},
+ corpus_meta: Optional[Dict[str, Any]] = None,
+ evaluation_set_meta: Optional[Dict[str, Any]] = None,
+ pipeline_meta: Optional[Dict[str, Any]] = None,
+ index_params: Optional[Dict] = None,
+ query_params: Optional[Dict] = None,
sas_model_name_or_path: Optional[str] = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
@@ -997,6 +1003,17 @@ class Pipeline:
Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
:param context_matching_threshold: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]
"""
+ if corpus_meta is None:
+ corpus_meta = {}
+ if evaluation_set_meta is None:
+ evaluation_set_meta = {}
+ if pipeline_meta is None:
+ pipeline_meta = {}
+ if index_params is None:
+ index_params = {}
+ if query_params is None:
+ query_params = {}

if experiment_tracking_tool is not None:
tracking_head_cls = TRACKING_TOOL_TO_HEAD.get(experiment_tracking_tool, None)
if tracking_head_cls is None:
@@ -2213,7 +2230,7 @@ class Pipeline:
"document_id_or_answer",
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
- wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
+ wrong_examples_fields: Optional[List[str]] = None,
max_characters_per_field: int = 150,
):
"""
@@ -2249,9 +2266,11 @@ class Pipeline:
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
- :param wrong_examples_fields: A list of fields to include in the worst samples.
+ :param wrong_examples_fields: A list of fields to include in the worst samples. By default, "answer", "context", and "document_id" are included.
:param max_characters_per_field: The maximum number of characters to include in the worst samples report (per field).
"""
+ if wrong_examples_fields is None:
+ wrong_examples_fields = ["answer", "context", "document_id"]
graph = DiGraph(self.graph.edges)
print_eval_report(
eval_result=eval_result,
@@ -202,7 +202,7 @@ class RayPipeline(Pipeline):

@classmethod
def _create_ray_deployment(
- cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = {}
+ cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = None
):
"""
Create a Ray Deployment for the Component.
@@ -215,6 +215,8 @@ class RayPipeline(Pipeline):
Ray Serve API docs (https://docs.ray.io/en/latest/serve/package-ref.html)
under the `ray.serve.deployment()` method
"""
+ if serve_deployment_kwargs is None:
+ serve_deployment_kwargs = {}
RayDeployment = serve.deployment(
_RayDeploymentWrapper, name=component_name, **serve_deployment_kwargs  # type: ignore
)
@@ -241,7 +241,7 @@ class BaseStandardPipeline(ABC):
"document_id_or_answer",
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
- wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
+ wrong_examples_fields: Optional[List[str]] = None,
max_characters_per_field: int = 150,
):
"""
@@ -277,9 +277,11 @@ class BaseStandardPipeline(ABC):
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
- :param wrong_examples_fields: A list of field names to include in the worst samples.
+ :param wrong_examples_fields: A list of field names to include in the worst samples. By default, "answer", "context", and "document_id" are used.
:param max_characters_per_field: The maximum number of characters per wrong example to show (per field).
"""
+ if wrong_examples_fields is None:
+ wrong_examples_fields = ["answer", "context", "document_id"]
if metrics_filter is None:
metrics_filter = self.metrics_filter
self.pipeline.print_eval_report(
@@ -178,7 +178,7 @@ def print_eval_report(
"document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
- wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
+ wrong_examples_fields: Optional[List[str]] = None,
max_characters_per_field: int = 150,
):
"""
@@ -216,9 +216,11 @@ def print_eval_report(
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
- :param wrong_examples_fields: A list of field names that should be included in the wrong examples.
+ :param wrong_examples_fields: A list of field names that should be included in the wrong examples. By default, "answer", "context", and "document_id" are used.
:param max_characters_per_field: The maximum number of characters to show in the wrong examples report (per field).
"""
+ if wrong_examples_fields is None:
+ wrong_examples_fields = ["answer", "context", "document_id"]
if any(degree > 1 for node, degree in graph.out_degree):
logger.warning("Pipelines with junctions are currently not supported.")
return
@@ -309,9 +311,11 @@ def _format_wrong_examples_report(
"document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
- fields: List[str] = ["answer", "context", "document_id"],
+ fields: Optional[List[str]] = None,
max_chars: int = 150,
):
+ if fields is None:
+ fields = ["answer", "context", "document_id"]
examples = {
node: eval_result.wrong_examples(
node, document_scope=document_scope, answer_scope=answer_scope, n=n_wrong_examples
@@ -302,7 +302,9 @@ class SpeechDocument(Document):
return f"<SpeechDocument: id={self.id}, content=None>"
return f"<SpeechDocument: id={self.id}, content='{self.content[:100]}{'...' if len(self.content) > 100 else ''}', content_audio={self.content_audio}>"

- def to_dict(self, field_map={}) -> Dict:
+ def to_dict(self, field_map=None) -> Dict:
+ if field_map is None:
+ field_map = {}
dictionary = super().to_dict(field_map=field_map)
for key, value in dictionary.items():
if isinstance(value, Path):
@@ -310,7 +312,9 @@ class SpeechDocument(Document):
return dictionary

@classmethod
- def from_dict(cls, dict, field_map={}, id_hash_keys=None):
+ def from_dict(cls, dict, field_map=None, id_hash_keys=None):
+ if field_map is None:
+ field_map = {}
doc = super().from_dict(dict=dict, field_map=field_map, id_hash_keys=id_hash_keys)
doc.content_audio = Path(dict["content_audio"])
return doc
@@ -133,7 +133,7 @@ def send_event(func):
return wrapper


- def send_custom_event(event: str = "", payload: Dict[str, Any] = {}):
+ def send_custom_event(event: str = "", payload: Optional[Dict[str, Any]] = None):
"""
This method can be called directly from anywhere in Haystack to send an event.
Enriches the given event with metadata and sends it to the posthog server if telemetry is enabled.
@@ -143,6 +143,8 @@ def send_custom_event(event: str = "", payload: Dict[str, Any] = {}):
:param payload: A dictionary containing event meta data, e.g., parameter settings
"""
global user_id  # pylint: disable=global-statement
+ if payload is None:
+ payload = {}
try:

def send_request(payload: Dict[str, Any]):
@@ -144,7 +144,7 @@ class DeepsetCloudClient:
def post(
self,
url: str,
- json: dict = {},
+ json: Optional[Dict] = None,
data: Optional[Any] = None,
query_params: Optional[dict] = None,
headers: Optional[dict] = None,
@@ -152,6 +152,8 @@ class DeepsetCloudClient:
files: Optional[Any] = None,
raise_on_error: bool = True,
):
+ if json is None:
+ json = {}
return self._execute_request(
method="POST",
url=url,
@@ -167,7 +169,7 @@ class DeepsetCloudClient:
def post_with_auto_paging(
self,
url: str,
- json: dict = {},
+ json: Optional[Dict] = None,
data: Optional[Any] = None,
query_params: Optional[dict] = None,
headers: Optional[dict] = None,
@@ -175,6 +177,8 @@ class DeepsetCloudClient:
raise_on_error: bool = True,
auto_paging_page_size: Optional[int] = None,
):
+ if json is None:
+ json = {}
return self._execute_auto_paging_request(
method="POST",
url=url,
@@ -211,7 +215,7 @@ class DeepsetCloudClient:
def put_with_auto_paging(
self,
url: str,
- json: dict = {},
+ json: Optional[Dict] = None,
data: Optional[Any] = None,
query_params: Optional[dict] = None,
headers: Optional[dict] = None,
@@ -219,6 +223,8 @@ class DeepsetCloudClient:
raise_on_error: bool = True,
auto_paging_page_size: Optional[int] = None,
):
+ if json is None:
+ json = {}
return self._execute_auto_paging_request(
method="PUT",
url=url,
@@ -278,7 +278,6 @@ disable = [
"unspecified-encoding",
"unidiomatic-typecheck",
"no-name-in-module",
- "dangerous-default-value",
"consider-using-with",
"redefined-outer-name",
"arguments-renamed",
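With the mutable defaults removed, the commit also drops "dangerous-default-value" from the pylint disable list in pyproject.toml, so pylint's W0102 check flags any new occurrences. A hedged sketch of running only that check from Python, assuming pylint is installed (the module path is an illustrative example, not a path referenced by this commit):

    # Run only the dangerous-default-value check (pylint message W0102) on one file.
    from pylint.lint import Run

    Run(
        [
            "--disable=all",
            "--enable=dangerous-default-value",
            "haystack/nodes/preprocessor/preprocessor.py",  # example target path
        ],
        exit=False,  # report findings without exiting the interpreter
    )

The equivalent command-line invocation (pylint --disable=all --enable=dangerous-default-value <path>) behaves the same way.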