Mirror of https://github.com/deepset-ai/haystack.git
refactor: replace mutable default arguments (#4070)
* refactor: replace mutable default arguments
* change type annotation in BasePreProcessor to Optional[List]
This commit is contained in:
parent
3273a2714d
commit
0e282e5ca4
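
For context on the refactor: in Python, default argument values are evaluated once, at function definition time, so a mutable default such as `[]` or `{}` is shared by every call that omits the argument. The `Optional[...] = None` pattern applied throughout this commit avoids that. A minimal sketch of the pitfall and the fix (illustrative function names, not taken from the diff):

from typing import List, Optional

def append_bad(item: int, bucket: List[int] = []) -> List[int]:
    # The [] is created once when `def` executes; all calls share that one list.
    bucket.append(item)
    return bucket

def append_good(item: int, bucket: Optional[List[int]] = None) -> List[int]:
    # None is an immutable sentinel; each call builds a fresh list.
    if bucket is None:
        bucket = []
    bucket.append(item)
    return bucket

print(append_bad(1))   # [1]
print(append_bad(2))   # [1, 2]  <- state leaked from the previous call
print(append_good(1))  # [1]
print(append_good(2))  # [2]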
@@ -49,7 +49,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         use_bm25: bool = False,
         bm25_tokenization_regex: str = r"(?u)\b\w\w+\b",
         bm25_algorithm: Literal["BM25Okapi", "BM25L", "BM25Plus"] = "BM25Okapi",
-        bm25_parameters: dict = {},
+        bm25_parameters: Optional[Dict] = None,
     ):
         """
         :param index: The documents are scoped to an index attribute that can be used when writing, querying,
@@ -87,7 +87,10 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         :param bm25_parameters: Parameters for BM25 implementation in a dictionary format.
             For example: {'k1':1.5, 'b':0.75, 'epsilon':0.25}
             You can learn more about these parameters by visiting https://github.com/dorianbrown/rank_bm25
+            By default, no parameters are set.
         """
+        if bm25_parameters is None:
+            bm25_parameters = {}
         super().__init__()

         self.indexes: Dict[str, Dict] = defaultdict(dict)
@@ -68,7 +68,7 @@ class PineconeDocumentStore(BaseDocumentStore):
         progress_bar: bool = True,
         duplicate_documents: str = "overwrite",
         recreate_index: bool = False,
-        metadata_config: dict = {"indexed": []},
+        metadata_config: Optional[Dict] = None,
         validate_index_sync: bool = True,
     ):
         """
@@ -106,6 +106,8 @@ class PineconeDocumentStore(BaseDocumentStore):
             Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. By default,
             no fields are indexed.
         """
+        if metadata_config is None:
+            metadata_config = {"indexed": []}
         # Connect to Pinecone server using python client binding
         if not api_key:
             raise PineconeDocumentStoreError(
@@ -201,12 +203,14 @@ class PineconeDocumentStore(BaseDocumentStore):
         replicas: Optional[int] = 1,
         shards: Optional[int] = 1,
         recreate_index: bool = False,
-        metadata_config: dict = {"indexed": []},
+        metadata_config: Optional[Dict] = None,
     ):
         """
        Create a new index for storing documents in case an
        index with the name doesn't exist already.
        """
+        if metadata_config is None:
+            metadata_config = {"indexed": []}
         index = self._index_name(index)

         if recreate_index:
@@ -555,7 +555,7 @@ class DataSiloForCrossVal:
     def make(
         cls,
         datasilo: DataSilo,
-        sets: List[str] = ["train", "dev", "test"],
+        sets: Optional[List[str]] = None,
         n_splits: int = 5,
         shuffle: bool = True,
         random_state: Optional[int] = None,
@@ -568,7 +568,7 @@ class DataSiloForCrossVal:
         original data silo passed on.

         :param datasilo: The data silo that contains the original data.
-        :param sets: Which sets to use to create the xval folds (strings)
+        :param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used.
         :param n_splits: number of folds to create
         :param shuffle: shuffle each class' samples before splitting
         :param random_state: random state for shuffling
@@ -576,6 +576,8 @@ class DataSiloForCrossVal:
             It is never done with question answering.
         :param n_neg_answers_per_question: number of negative answers per question to include for training
         """
+        if sets is None:
+            sets = ["train", "dev", "test"]
         if "question_answering" in datasilo.processor.tasks and n_inner_splits is None:  # type: ignore
             return cls._make_question_answering(
                 datasilo, sets, n_splits, shuffle, random_state, n_neg_answers_per_question
@@ -588,7 +590,7 @@ class DataSiloForCrossVal:
     def _make_question_answering(
         cls,
         datasilo: DataSilo,
-        sets: List[str] = ["train", "dev", "test"],
+        sets: Optional[List[str]] = None,
         n_splits: int = 5,
         shuffle: bool = True,
         random_state: Optional[int] = None,
@@ -600,12 +602,14 @@ class DataSiloForCrossVal:
         data for question-answering-

         :param datasilo: The data silo that contains the original data.
-        :param sets: Which sets to use to create the xval folds (strings).
+        :param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used.
         :param n_splits: Number of folds to create.
         :param shuffle: Shuffle each class' samples before splitting.
         :param random_state: Random state for shuffling.
         :param n_neg_answers_per_question: Number of negative answers per question to include for training.
         """
+        if sets is None:
+            sets = ["train", "dev", "test"]
         assert "id" in datasilo.tensor_names, f"Expected tensor 'id' in tensor names, found {datasilo.tensor_names}"  # type: ignore
         assert "labels" in datasilo.tensor_names, f"Expected tensor 'labels' in tensor names, found {datasilo.tensor_names}"  # type: ignore

@@ -59,7 +59,7 @@ class Processor(ABC):
         test_filename: Optional[Union[Path, str]],
         dev_split: float,
         data_dir: Optional[Union[Path, str]],
-        tasks: Dict = {},
+        tasks: Optional[Dict] = None,
         proxies: Optional[Dict] = None,
         multithreading_rust: Optional[bool] = True,
     ):
@@ -82,6 +82,8 @@ class Processor(ABC):
            Note: Enabling multithreading in Rust AND multiprocessing in python might cause
            deadlocks.
         """
+        if tasks is None:
+            tasks = {}
         if not multithreading_rust:
             os.environ["RAYON_RS_NUM_CPUS"] = "1"

@@ -313,7 +315,7 @@ class Processor(ABC):

     @abstractmethod
     def dataset_from_dicts(
-        self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+        self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
     ):
         raise NotImplementedError()

@@ -444,7 +446,7 @@ class SquadProcessor(Processor):
         )

     def dataset_from_dicts(
-        self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+        self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
     ):
         """
         Convert input dictionaries into a pytorch dataset for Question Answering.
@@ -456,6 +458,8 @@ class SquadProcessor(Processor):
         :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique
         :param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
         """
+        if indices is None:
+            indices = []
         # Convert to standard format
         pre_baskets = [self.convert_qa_input_dict(x) for x in dicts]  # TODO move to input object conversion

@@ -990,7 +994,7 @@ class TextSimilarityProcessor(Processor):
             json.dump(config, file)

     def dataset_from_dicts(
-        self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+        self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
     ):
         """
         Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
@@ -1013,6 +1017,8 @@ class TextSimilarityProcessor(Processor):
         :param return_baskets: whether to return the baskets or not (baskets are needed during inference)
         :return: dataset, tensor_names, problematic_ids, [baskets]
         """
+        if indices is None:
+            indices = []
         # Take the dict and insert into our basket structure, this stages also adds an internal IDs
         baskets = self._fill_baskets(dicts, indices)

@@ -1254,7 +1260,7 @@ class TableTextSimilarityProcessor(Processor):
         dev_split: float = 0.1,
         proxies: Optional[Dict] = None,
         max_samples: Optional[int] = None,
-        embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
+        embed_meta_fields: Optional[List[str]] = None,
         num_positives: int = 1,
         num_hard_negatives: int = 1,
         shuffle_negatives: bool = True,
@@ -1284,7 +1290,7 @@ class TableTextSimilarityProcessor(Processor):
         :param proxies: Proxy configuration to allow downloads of remote datasets.
             Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
         :param max_samples: maximum number of samples to use.
-        :param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization.
+        :param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization. By default, "page_title", "section_title", and "caption" are used.
         :param num_hard_negatives: Maximum number of hard negative context passages in a sample.
         :param num_positives: Maximum number of positive context passages in a sample.
         :param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the
@@ -1296,6 +1302,8 @@ class TableTextSimilarityProcessor(Processor):
         """
         # TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs

+        if embed_meta_fields is None:
+            embed_meta_fields = ["page_title", "section_title", "caption"]
         # Custom processor attributes
         self.max_samples = max_samples
         self.query_tokenizer = query_tokenizer
@@ -1511,7 +1519,7 @@ class TableTextSimilarityProcessor(Processor):
         return standard_dicts

     def dataset_from_dicts(
-        self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+        self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
     ):
         """
         Convert input dictionaries into a pytorch dataset for TextSimilarity.
@@ -1533,7 +1541,8 @@ class TableTextSimilarityProcessor(Processor):
         :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique
         :param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
         """
+        if indices is None:
+            indices = []
         # Take the dict and insert into our basket structure, this stages also adds an internal IDs
         baskets = self._fill_baskets(dicts, indices)

@@ -1861,8 +1870,10 @@ class TextClassificationProcessor(Processor):
         raise NotImplementedError

     def dataset_from_dicts(
-        self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+        self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
     ):
+        if indices is None:
+            indices = []
         baskets = []
         # Tokenize in batches
         texts = [x["text"] for x in dicts]
@@ -2043,10 +2054,12 @@ class UnlabeledTextProcessor(Processor):
         test_filename: Optional[Union[Path, str]] = None,
         dev_split: float = 0,
         data_dir: Optional[Union[Path, str]] = None,
-        tasks: Dict = {},
+        tasks: Optional[Dict] = None,
         proxies: Optional[Dict] = None,
         multithreading_rust: Optional[bool] = True,
     ):
+        if tasks is None:
+            tasks = {}
         super().__init__(
             tokenizer,
             max_seq_len,
@@ -2069,8 +2082,10 @@ class UnlabeledTextProcessor(Processor):
         return dicts

     def dataset_from_dicts(
-        self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
+        self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
     ):
+        if indices is None:
+            indices = []
         if return_baskets:
             raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor")
         texts = [dict_["text"] for dict_ in dicts]
@@ -42,8 +42,8 @@ class BiAdaptiveModel(nn.Module):
         prediction_heads: List[PredictionHead],
         embeds_dropout_prob: float = 0.1,
         device: torch.device = torch.device("cuda"),
-        lm1_output_types: Union[str, List[str]] = ["per_sequence"],
-        lm2_output_types: Union[str, List[str]] = ["per_sequence"],
+        lm1_output_types: Optional[Union[str, List[str]]] = None,
+        lm2_output_types: Optional[Union[str, List[str]]] = None,
         loss_aggregation_fn: Optional[Callable] = None,
     ):
         """
@@ -54,12 +54,12 @@ class BiAdaptiveModel(nn.Module):
            language models will be zeroed.
         :param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set
             to "per_token", one embedding will be extracted per input token. If set to
-            "per_sequence", a single embedding will be extracted to represent the full
+            "per_sequence" (default), a single embedding will be extracted to represent the full
             input sequence. Can either be a single string, or a list of strings,
             one for each prediction head.
         :param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set
             to "per_token", one embedding will be extracted per input token. If set to
-            "per_sequence", a single embedding will be extracted to represent the full
+            "per_sequence" (default), a single embedding will be extracted to represent the full
             input sequence. Can either be a single string, or a list of strings,
             one for each prediction head.
         :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda").
@@ -74,6 +74,10 @@ class BiAdaptiveModel(nn.Module):
            Note: The loss at this stage is per sample, i.e one tensor of
            shape (batchsize) per prediction head.
         """
+        if lm1_output_types is None:
+            lm1_output_types = ["per_sequence"]
+        if lm2_output_types is None:
+            lm2_output_types = ["per_sequence"]
         super(BiAdaptiveModel, self).__init__()

         self.device = device
@@ -231,7 +231,7 @@ class QuestionAnsweringHead(PredictionHead):

     def __init__(
         self,
-        layer_dims: List[int] = [768, 2],
+        layer_dims: Optional[List[int]] = None,
         task_name: str = "question_answering",
         no_ans_boost: float = 0.0,
         context_window_size: int = 100,
@@ -244,7 +244,7 @@ class QuestionAnsweringHead(PredictionHead):
         **kwargs,
     ):
         """
-        :param layer_dims: dimensions of Feed Forward block, e.g. [768,2], for adjusting to BERT embedding. Output should be always 2
+        :param layer_dims: dimensions of Feed Forward block, e.g. [768,2] used by default, for adjusting to BERT embedding. Output should be always 2
         :param kwargs: placeholder for passing generic parameters
         :param no_ans_boost: How much the no_answer logit is boosted/increased.
             The higher the value, the more likely a "no answer possible given the input text" is returned by the model
@@ -260,6 +260,8 @@ class QuestionAnsweringHead(PredictionHead):
         :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
             Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
         """
+        if layer_dims is None:
+            layer_dims = [768, 2]
         super(QuestionAnsweringHead, self).__init__()
         if len(kwargs) > 0:
             logger.warning(
@@ -248,7 +248,7 @@ class QAPred(Pred):
         aggregation_level: str,
         no_answer_gap: float,
         ground_truth_answer: Optional[str] = None,
-        answer_types: List[str] = [],
+        answer_types: Optional[List[str]] = None,
     ):
         """
         :param id: The id of the passage or document
@@ -262,6 +262,8 @@ class QAPred(Pred):
         :param ground_truth_answer: Ground truth answers
         :param answer_types: List of answer_types supported by this task e.g. ["span", "yes_no", "no_answer"]
         """
+        if answer_types is None:
+            answer_types = []
         super().__init__(id, prediction, context)
         self.question = question
         self.token_offsets = token_offsets
@@ -44,9 +44,9 @@ class TriAdaptiveModel(nn.Module):
         prediction_heads: List[PredictionHead],
         embeds_dropout_prob: float = 0.1,
         device: torch.device = torch.device("cuda"),
-        lm1_output_types: Union[str, List[str]] = ["per_sequence"],
-        lm2_output_types: Union[str, List[str]] = ["per_sequence"],
-        lm3_output_types: Union[str, List[str]] = ["per_sequence"],
+        lm1_output_types: Optional[Union[str, List[str]]] = None,
+        lm2_output_types: Optional[Union[str, List[str]]] = None,
+        lm3_output_types: Optional[Union[str, List[str]]] = None,
         loss_aggregation_fn: Optional[Callable] = None,
     ):
         """
@@ -58,17 +58,17 @@ class TriAdaptiveModel(nn.Module):
            language model will be zeroed.
         :param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set
             to "per_token", one embedding will be extracted per input token. If set to
-            "per_sequence", a single embedding will be extracted to represent the full
+            "per_sequence" (default), a single embedding will be extracted to represent the full
             input sequence. Can either be a single string, or a list of strings,
             one for each prediction head.
         :param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set
             to "per_token", one embedding will be extracted per input token. If set to
-            "per_sequence", a single embedding will be extracted to represent the full
+            "per_sequence" (default), a single embedding will be extracted to represent the full
             input sequence. Can either be a single string, or a list of strings,
             one for each prediction head.
         :param lm3_output_types: How to extract the embeddings from the final layer of the third language model. When set
             to "per_token", one embedding will be extracted per input token. If set to
-            "per_sequence", a single embedding will be extracted to represent the full
+            "per_sequence" (default), a single embedding will be extracted to represent the full
             input sequence. Can either be a single string, or a list of strings,
             one for each prediction head.
         :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda").
@@ -83,7 +83,12 @@ class TriAdaptiveModel(nn.Module):
            Note: The loss at this stage is per sample, i.e one tensor of
            shape (batchsize) per prediction head.
         """
+        if lm1_output_types is None:
+            lm1_output_types = ["per_sequence"]
+        if lm2_output_types is None:
+            lm2_output_types = ["per_sequence"]
+        if lm3_output_types is None:
+            lm3_output_types = ["per_sequence"]
         super(TriAdaptiveModel, self).__init__()
         self.device = device
         self.language_model1 = language_model1.to(device)
@@ -261,10 +261,12 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
     return component_schema, {"$ref": f"#/definitions/{component_name}"}


-def get_json_schema(filename: str, version: str, modules: List[str] = ["haystack.document_stores", "haystack.nodes"]):
+def get_json_schema(filename: str, version: str, modules: Optional[List[str]] = None):
     """
     Generate JSON schema for Haystack pipelines.
     """
+    if modules is None:
+        modules = ["haystack.document_stores", "haystack.nodes"]
     schema_definitions = {}  # All the schemas for the node and accessory classes
     node_refs = []  # References to the nodes only (accessory classes cannot be listed among the nodes in a config)

@@ -1,5 +1,5 @@
 import mimetypes
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Union, Optional

 import logging
 from pathlib import Path
@@ -29,14 +29,16 @@ class FileTypeClassifier(BaseComponent):

     outgoing_edges = len(DEFAULT_TYPES)

-    def __init__(self, supported_types: List[str] = DEFAULT_TYPES):
+    def __init__(self, supported_types: Optional[List[str]] = None):
         """
         Node that sends out files on a different output edge depending on their extension.

         :param supported_types: The file types that this node can distinguish between.
-            The default values are: `txt`, `pdf`, `md`, `docx`, and `html`.
+            If no value is provided, the value created by default comprises: `txt`, `pdf`, `md`, `docx`, and `html`.
             Lists with duplicate elements are not allowed.
         """
+        if supported_types is None:
+            supported_types = DEFAULT_TYPES
         if len(set(supported_types)) != len(supported_types):
             duplicates = supported_types
             for item in set(supported_types):
@@ -137,7 +137,7 @@ class BaseConverter(BaseComponent):
         file_paths: Union[Path, List[Path]],
         meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
         remove_numeric_tables: Optional[bool] = None,
-        known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+        known_ligatures: Optional[Dict[str, str]] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "UTF-8",
         id_hash_keys: Optional[List[str]] = None,
@@ -153,12 +153,13 @@ class BaseConverter(BaseComponent):
            does not have table parsing capability for finding answers. However, tables
            may also have long strings that could possible candidate for searching answers.
            The rows containing strings are thus retained in this option.
-        :param known_ligatures: Some converters tends to recognize clusters of letters as ligatures, such as "ff" (double f).
+        :param known_ligatures: Some converters tend to recognize clusters of letters as ligatures, such as "ff" (double f).
            Such ligatures however make text hard to compare with the content of other files,
            which are generally ligature free. Therefore we automatically find and replace the most
            common ligatures with their split counterparts. The default mapping is in
            `haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabeths
            but excludes all ligatures that are known to be used in IPA.
+           If no value is provided, this default is created and used.
            You can use this parameter to provide your own set of ligatures to clean up from the documents.
         :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
            (https://en.wikipedia.org/wiki/ISO_639-1) format.
@@ -171,6 +172,8 @@ class BaseConverter(BaseComponent):
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
         """
+        if known_ligatures is None:
+            known_ligatures = KNOWN_LIGATURES

         if isinstance(file_paths, Path):
             file_paths = [file_paths]
@@ -206,7 +209,7 @@ class BaseConverter(BaseComponent):
         file_paths: Union[Path, List[Path]],
         meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
         remove_numeric_tables: Optional[bool] = None,
-        known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+        known_ligatures: Optional[Dict[str, str]] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "UTF-8",
         id_hash_keys: Optional[List[str]] = None,
@@ -24,7 +24,7 @@ class ImageToTextConverter(BaseConverter):
     def __init__(
         self,
         remove_numeric_tables: bool = False,
-        valid_languages: Optional[List[str]] = ["eng"],
+        valid_languages: Optional[List[str]] = None,
         id_hash_keys: Optional[List[str]] = None,
     ):
         """
@@ -37,7 +37,8 @@ class ImageToTextConverter(BaseConverter):
            (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
            This option can be used to add test for encoding errors. If the extracted text is
            not one of the valid languages, then it might likely be encoding error resulting
-           in garbled text. Run the following line of code to check available language packs:
+           in garbled text. If no value is provided, English will be set as default.
+           Run the following line of code to check available language packs:
            # List of available languages
            print(pytesseract.get_languages(config=''))
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
@@ -45,6 +46,8 @@ class ImageToTextConverter(BaseConverter):
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
         """
+        if valid_languages is None:
+            valid_languages = ["eng"]
         super().__init__(
             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
         )
@@ -208,7 +208,7 @@ class PDFToTextOCRConverter(BaseConverter):
     def __init__(
         self,
         remove_numeric_tables: bool = False,
-        valid_languages: Optional[List[str]] = ["eng"],
+        valid_languages: Optional[List[str]] = None,
         id_hash_keys: Optional[List[str]] = None,
     ):
         """
@@ -223,12 +223,14 @@ class PDFToTextOCRConverter(BaseConverter):
            (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
            This option can be used to add test for encoding errors. If the extracted text is
            not one of the valid languages, then it might likely be encoding error resulting
-           in garbled text.
+           in garbled text. If no value is provided, English will be set as default.
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
         """
+        if valid_languages is None:
+            valid_languages = ["eng"]
         # init image to text instance
         self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)

@@ -21,7 +21,7 @@ class BasePreProcessor(BaseComponent):
         clean_whitespace: Optional[bool] = True,
         clean_header_footer: Optional[bool] = False,
         clean_empty_lines: Optional[bool] = True,
-        remove_substrings: List[str] = [],
+        remove_substrings: Optional[List[str]] = None,
         split_by: Literal["word", "sentence", "passage", None] = "word",
         split_length: Optional[int] = 1000,
         split_overlap: Optional[int] = None,
@@ -41,7 +41,7 @@ class BasePreProcessor(BaseComponent):
         clean_whitespace: bool,
         clean_header_footer: bool,
         clean_empty_lines: bool,
-        remove_substrings: List[str],
+        remove_substrings: Optional[List[str]],
     ) -> Document:
         raise NotImplementedError

@@ -54,7 +54,7 @@ class PreProcessor(BasePreProcessor):
         clean_whitespace: bool = True,
         clean_header_footer: bool = False,
         clean_empty_lines: bool = True,
-        remove_substrings: List[str] = [],
+        remove_substrings: Optional[List[str]] = None,
         split_by: Optional[Literal["word", "sentence", "passage"]] = "word",
         split_length: int = 200,
         split_overlap: int = 0,
@@ -73,7 +73,7 @@ class PreProcessor(BasePreProcessor):
            or similar.
         :param clean_whitespace: Strip whitespaces before or after each line in the text.
         :param clean_empty_lines: Remove more than two empty lines in the text.
-        :param remove_substrings: Remove specified substrings from the text.
+        :param remove_substrings: Remove specified substrings from the text. If no value is provided an empty list is created by default.
         :param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
         :param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
            "sentence", then each output document will have 10 sentences.
@@ -100,6 +100,8 @@ class PreProcessor(BasePreProcessor):
            `AzureConverter`.
         :param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning.
         """
+        if remove_substrings is None:
+            remove_substrings = []
         super().__init__()

         try:
@@ -132,7 +134,7 @@ class PreProcessor(BasePreProcessor):
         clean_whitespace: Optional[bool] = None,
         clean_header_footer: Optional[bool] = None,
         clean_empty_lines: Optional[bool] = None,
-        remove_substrings: List[str] = [],
+        remove_substrings: Optional[List[str]] = None,
         split_by: Optional[Literal["word", "sentence", "passage"]] = None,
         split_length: Optional[int] = None,
         split_overlap: Optional[int] = None,
@@ -143,6 +145,8 @@ class PreProcessor(BasePreProcessor):
         """
         Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
         """
+        if remove_substrings is None:
+            remove_substrings = []
         if not isinstance(documents, list):
             warnings.warn(
                 "Using a single Document as argument to the 'documents' parameter is deprecated. Use a list "
@@ -197,14 +201,15 @@ class PreProcessor(BasePreProcessor):
         clean_whitespace: Optional[bool] = None,
         clean_header_footer: Optional[bool] = None,
         clean_empty_lines: Optional[bool] = None,
-        remove_substrings: List[str] = [],
+        remove_substrings: Optional[List[str]] = None,
         split_by: Optional[Literal["word", "sentence", "passage"]] = None,
         split_length: Optional[int] = None,
         split_overlap: Optional[int] = None,
         split_respect_sentence_boundary: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
     ) -> List[Document]:
+        if remove_substrings is None:
+            remove_substrings = []
         if clean_whitespace is None:
             clean_whitespace = self.clean_whitespace
         if clean_header_footer is None:
@@ -258,13 +263,15 @@ class PreProcessor(BasePreProcessor):
         clean_whitespace: bool,
         clean_header_footer: bool,
         clean_empty_lines: bool,
-        remove_substrings: List[str],
+        remove_substrings: Optional[List[str]] = None,
         id_hash_keys: Optional[List[str]] = None,
     ) -> Document:
         """
         Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
         and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
         """
+        if remove_substrings is None:
+            remove_substrings = []
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys

@@ -67,7 +67,7 @@ class TransformersQueryClassifier(BaseQueryClassifier):
         tokenizer: Optional[str] = None,
         use_gpu: bool = True,
         task: str = "text-classification",
-        labels: List[str] = DEFAULT_LABELS,
+        labels: Optional[List[str]] = None,
         batch_size: int = 16,
         progress_bar: bool = True,
         use_auth_token: Optional[Union[str, bool]] = None,
@@ -96,6 +96,8 @@ class TransformersQueryClassifier(BaseQueryClassifier):
            [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
            parameter is not used and a single cpu device is used for inference.
         """
+        if labels is None:
+            labels = DEFAULT_LABELS
         super().__init__()
         resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
         if len(resolved_devices) > 1:
@@ -176,7 +176,7 @@ class FARMReader(BaseReader):
         dev_filename: Optional[str] = None,
         test_filename: Optional[str] = None,
         use_gpu: Optional[bool] = None,
-        devices: List[torch.device] = [],
+        devices: Optional[List[torch.device]] = None,
         batch_size: int = 10,
         n_epochs: int = 2,
         learning_rate: float = 1e-5,
@@ -205,6 +205,8 @@ class FARMReader(BaseReader):
         doc_stride: Optional[int] = None,
         max_query_length: Optional[int] = None,
     ):
+        if devices is None:
+            devices = []
         if dev_filename:
             dev_split = 0

@@ -363,7 +365,7 @@ class FARMReader(BaseReader):
         dev_filename: Optional[str] = None,
         test_filename: Optional[str] = None,
         use_gpu: Optional[bool] = None,
-        devices: List[torch.device] = [],
+        devices: Optional[List[torch.device]] = None,
         batch_size: int = 10,
         n_epochs: int = 2,
         learning_rate: float = 1e-5,
@@ -469,7 +471,7 @@ class FARMReader(BaseReader):
         dev_filename: Optional[str] = None,
         test_filename: Optional[str] = None,
         use_gpu: Optional[bool] = None,
-        devices: List[torch.device] = [],
+        devices: Optional[List[torch.device]] = None,
         batch_size: int = 10,
         teacher_batch_size: Optional[int] = None,
         n_epochs: int = 2,
@@ -595,7 +597,7 @@ class FARMReader(BaseReader):
         dev_filename: Optional[str] = None,
         test_filename: Optional[str] = None,
         use_gpu: Optional[bool] = None,
-        devices: List[torch.device] = [],
+        devices: Optional[List[torch.device]] = None,
         batch_size: int = 10,
         teacher_batch_size: Optional[int] = None,
         n_epochs: int = 5,
@ -794,7 +794,7 @@ class TableTextRetriever(DenseRetriever):
|
|||||||
top_k: int = 10,
|
top_k: int = 10,
|
||||||
use_gpu: bool = True,
|
use_gpu: bool = True,
|
||||||
batch_size: int = 16,
|
batch_size: int = 16,
|
||||||
embed_meta_fields: List[str] = ["name", "section_title", "caption"],
|
embed_meta_fields: Optional[List[str]] = None,
|
||||||
use_fast_tokenizers: bool = True,
|
use_fast_tokenizers: bool = True,
|
||||||
similarity_function: str = "dot_product",
|
similarity_function: str = "dot_product",
|
||||||
global_loss_buffer_size: int = 150000,
|
global_loss_buffer_size: int = 150000,
|
||||||
@ -825,7 +825,8 @@ class TableTextRetriever(DenseRetriever):
|
|||||||
then used to create the embedding.
|
then used to create the embedding.
|
||||||
This is the approach used in the original paper and is likely to improve
|
This is the approach used in the original paper and is likely to improve
|
||||||
performance if your titles contain meaningful information for retrieval
|
performance if your titles contain meaningful information for retrieval
|
||||||
(topic, entities etc.).
|
(topic, entities etc.). If no value is provided, a default will be created.
|
||||||
|
That default embeds name, section title and caption.
|
||||||
:param use_fast_tokenizers: Whether to use fast Rust tokenizers
|
:param use_fast_tokenizers: Whether to use fast Rust tokenizers
|
||||||
:param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training.
|
:param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training.
|
||||||
Options: `dot_product` (Default) or `cosine`
|
Options: `dot_product` (Default) or `cosine`
|
||||||
@ -849,6 +850,8 @@ class TableTextRetriever(DenseRetriever):
|
|||||||
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
|
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
|
||||||
:param use_fast: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True.
|
:param use_fast: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True.
|
||||||
"""
|
"""
|
||||||
|
if embed_meta_fields is None:
|
||||||
|
embed_meta_fields = ["name", "section_title", "caption"]
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
|
||||||
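Note how the documented default moves out of the signature and into the body: callers see the same behavior, but the list is now built fresh per call, so no two retriever instances can share (and accidentally mutate) one default. A sketch of the pattern in isolation, using an illustrative class rather than Haystack's:

    from typing import List, Optional

    class Retriever:
        def __init__(self, embed_meta_fields: Optional[List[str]] = None):
            if embed_meta_fields is None:
                embed_meta_fields = ["name", "section_title", "caption"]  # fresh per instance
            self.embed_meta_fields = embed_meta_fields

    a, b = Retriever(), Retriever()
    a.embed_meta_fields.append("page_title")
    assert b.embed_meta_fields == ["name", "section_title", "caption"]  # b is unaffected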

@@ -1225,7 +1228,7 @@ class TableTextRetriever(DenseRetriever):
 max_processes: int = 128,
 dev_split: float = 0,
 batch_size: int = 2,
-embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
+embed_meta_fields: Optional[List[str]] = None,
 num_hard_negatives: int = 1,
 num_positives: int = 1,
 n_epochs: int = 3,

@@ -1260,7 +1263,7 @@ class TableTextRetriever(DenseRetriever):
 :param dev_split: The proportion of the train set that will be sliced. Only works if dev_filename is set to None.
 :param batch_size: Total number of samples in 1 batch of data.
 :param embed_meta_fields: Concatenate meta fields with each passage and table.
-The default setting in official MMRetrieval embeds page title,
+If no value is provided, a default will be created. That default embeds page title,
 section title and caption with the corresponding table and title with
 corresponding text passage.
 :param num_hard_negatives: Number of hard negative passages (passages which are

@@ -1290,6 +1293,8 @@ class TableTextRetriever(DenseRetriever):
 :param checkpoints_to_keep: The maximum number of train checkpoints to save.
 :param early_stopping: An initialized EarlyStopping object to control early stopping and saving of the best models.
 """
+if embed_meta_fields is None:
+    embed_meta_fields = ["page_title", "section_title", "caption"]

 self.processor.embed_meta_fields = embed_meta_fields
 self.processor.data_dir = Path(data_dir)

@@ -1393,7 +1398,7 @@ class TableTextRetriever(DenseRetriever):
 max_seq_len_table: int = 256,
 use_gpu: bool = True,
 batch_size: int = 16,
-embed_meta_fields: List[str] = ["name", "section_title", "caption"],
+embed_meta_fields: Optional[List[str]] = None,
 use_fast_tokenizers: bool = True,
 similarity_function: str = "dot_product",
 query_encoder_dir: str = "query_encoder",

@@ -1403,6 +1408,8 @@ class TableTextRetriever(DenseRetriever):
 """
 Load TableTextRetriever from the specified directory.
 """
+if embed_meta_fields is None:
+    embed_meta_fields = ["name", "section_title", "caption"]

 load_dir = Path(load_dir)
 mm_retriever = cls(

@@ -1441,7 +1448,7 @@ class EmbeddingRetriever(DenseRetriever):
 devices: Optional[List[Union[str, torch.device]]] = None,
 use_auth_token: Optional[Union[str, bool]] = None,
 scale_score: bool = True,
-embed_meta_fields: List[str] = [],
+embed_meta_fields: Optional[List[str]] = None,
 api_key: Optional[str] = None,
 ):
 """

@@ -1494,10 +1501,13 @@ class EmbeddingRetriever(DenseRetriever):
 This approach is also used in the TableTextRetriever paper and is likely to improve
 performance if your titles contain meaningful information for retrieval
 (topic, entities etc.).
+If no value is provided, a default empty list will be created.
 :param api_key: The OpenAI API key or the Cohere API key. Required if one wants to use OpenAI/Cohere embeddings.
 For more details see https://beta.openai.com/account/api-keys and https://dashboard.cohere.ai/api-keys

 """
+if embed_meta_fields is None:
+    embed_meta_fields = []
 super().__init__()

 self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)

@@ -1929,7 +1939,7 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever):
 devices: Optional[List[Union[str, torch.device]]] = None,
 use_auth_token: Optional[Union[str, bool]] = None,
 scale_score: bool = True,
-embed_meta_fields: List[str] = [],
+embed_meta_fields: Optional[List[str]] = None,
 ):
 """
 :param document_store: An instance of DocumentStore from which to retrieve documents.

@@ -1977,7 +1987,10 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever):
 This approach is also used in the TableTextRetriever paper and is likely to improve
 performance if your titles contain meaningful information for retrieval
 (topic, entities etc.).
+If no value is provided, a default empty list will be created.
 """
+if embed_meta_fields is None:
+    embed_meta_fields = []
 super().__init__(
 embedding_model=embedding_model,
 document_store=document_store,

@@ -44,7 +44,7 @@ class MultiModalEmbedder:
 embedding_models: Dict[str, Union[Path, str]], # replace str with ContentTypes starting from Python3.8
 feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None,
 batch_size: int = 16,
-embed_meta_fields: List[str] = ["name"],
+embed_meta_fields: Optional[List[str]] = None,
 progress_bar: bool = True,
 devices: Optional[List[Union[str, torch.device]]] = None,
 use_auth_token: Optional[Union[str, bool]] = None,

@@ -67,6 +67,7 @@ class MultiModalEmbedder:
 This is the approach used in the original paper and is likely to improve
 performance if your titles contain meaningful information for retrieval
 (topic, entities etc.).
+If no value is provided, a default with "name" as embedding field is created.
 :param progress_bar: Whether to show a tqdm progress bar or not.
 Can be helpful to disable in production deployments to keep the logs clean.
 :param devices: List of GPU (or CPU) devices to limit inference to certain GPUs and not use all available ones.

@@ -78,6 +79,8 @@ class MultiModalEmbedder:
 the local token is used, which must be previously created using `transformer-cli login`.
 For more information, see [Hugging Face documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained)
 """
+if embed_meta_fields is None:
+    embed_meta_fields = ["name"]
 super().__init__()

 self.devices = get_devices(devices)

@@ -22,11 +22,11 @@ class MultiModalRetriever(DenseRetriever):
 query_embedding_model: Union[Path, str],
 document_embedding_models: Dict[str, Union[Path, str]], # Replace str with ContentTypes starting Python3.8
 query_type: str = "text", # Replace str with ContentTypes starting Python3.8
-query_feature_extractor_params: Dict[str, Any] = {"max_length": 64},
+query_feature_extractor_params: Optional[Dict[str, Any]] = None,
-document_feature_extractors_params: Dict[str, Dict[str, Any]] = {"text": {"max_length": 256}},
+document_feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None,
 top_k: int = 10,
 batch_size: int = 16,
-embed_meta_fields: List[str] = ["name"],
+embed_meta_fields: Optional[List[str]] = None,
 similarity_function: str = "dot_product",
 progress_bar: bool = True,
 devices: Optional[List[Union[str, torch.device]]] = None,

@@ -46,14 +46,14 @@ class MultiModalRetriever(DenseRetriever):
 checkpoint with the content type it should handle ("text", "table", "image", and so on).
 The format equals the one used by Hugging Face transformers' modelhub models.
 :param query_type: The content type of the query ("text", "image" and so on).
-:param query_feature_extraction_params: The parameters to pass to the feature extractor of the query.
+:param query_feature_extraction_params: The parameters to pass to the feature extractor of the query. If no value is provided, a default dictionary with "max_length": 64 will be set.
-:param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents.
+:param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents. If no value is provided, a default dictionary with "text": {"max_length": 256} will be set.
 :param top_k: How many documents to return per query.
 :param batch_size: Number of questions or documents to encode at once. For multiple GPUs, this is
 the total batch size.
 :param embed_meta_fields: Concatenate the provided meta fields to a (text) pair that is then used to create
 the embedding. This is likely to improve performance if your titles contain meaningful information
-for retrieval (topic, entities, and so on). Note that only text and table documents support this feature.
+for retrieval (topic, entities, and so on). Note that only text and table documents support this feature. If no value is provided, a default with "name" as embedding field will be created.
 :param similarity_function: Which function to apply for calculating the similarity of query and document
 embeddings during training. Options: `dot_product` (default) or `cosine`.
 :param progress_bar: Whether to show a tqdm progress bar or not.

@@ -72,6 +72,12 @@ class MultiModalRetriever(DenseRetriever):
 range are scaled to a range of [0,1], where 1 means extremely relevant.
 Otherwise raw similarity scores (for example, cosine or dot_product) are used.
 """
+if query_feature_extractor_params is None:
+    query_feature_extractor_params = {"max_length": 64}
+if document_feature_extractors_params is None:
+    document_feature_extractors_params = {"text": {"max_length": 256}}
+if embed_meta_fields is None:
+    embed_meta_fields = ["name"]
 super().__init__()

 self.similarity_function = similarity_function
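Dictionary defaults carry the same risk as list defaults, and nesting makes it worse: with the old `{"text": {"max_length": 256}}` default, tweaking the inner dict on one retriever would have changed it for every later one. A standalone sketch (the function name is illustrative):

    from typing import Any, Dict, Optional

    def build_extractor_params(params: Optional[Dict[str, Dict[str, Any]]] = None) -> Dict[str, Dict[str, Any]]:
        if params is None:
            params = {"text": {"max_length": 256}}  # constructed on every call
        return params

    p1 = build_extractor_params()
    p1["text"]["max_length"] = 512          # tune one caller's copy
    p2 = build_extractor_params()
    assert p2["text"]["max_length"] == 256  # with a shared `= {...}` default, this would be 512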

@@ -745,12 +745,12 @@ class Pipeline:
 cls,
 index_pipeline: Pipeline,
 query_pipeline: Pipeline,
-index_params: dict = {},
+index_params: Optional[Dict] = None,
-query_params: dict = {},
+query_params: Optional[Dict] = None,
 dataset: str = "scifact",
 dataset_dir: Path = Path("."),
 num_documents: Optional[int] = None,
-top_k_values: List[int] = [1, 3, 5, 10, 100, 1000],
+top_k_values: Optional[List[int]] = None,
 keep_index: bool = False,
 ) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]:
 """

@@ -765,7 +765,7 @@ class Pipeline:
 :param dataset_dir: The directory to store the dataset in.
 :param num_documents: Maximum number of documents to load from given dataset. If set to None (default)
 or to a value larger than the number of documents in the dataset, the full dataset is loaded.
-:param top_k_values: The top_k values each metric will be calculated for.
+:param top_k_values: The top_k values each metric will be calculated for. By default, the values are 1, 3, 5, 10, 100, and 1000.
 :param keep_index: Whether to keep the index after evaluation.
 If True the index will be kept after beir evaluation. Otherwise it will be deleted immediately afterwards.
 Defaults to False.

@@ -773,6 +773,12 @@ class Pipeline:
 Returns a tuple containing the ndcg, map, recall and precision scores.
 Each metric is represented by a dictionary containing the scores for each top_k value.
 """
+if index_params is None:
+    index_params = {}
+if query_params is None:
+    query_params = {}
+if top_k_values is None:
+    top_k_values = [1, 3, 5, 10, 100, 1000]
 try:
 from beir import util
 from beir.datasets.data_loader import GenericDataLoader
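One detail worth calling out in all of these checks: the commit tests `is None` rather than truthiness. That distinction matters whenever an empty dict or list is a legitimate explicit argument; `value or default` would silently discard it. A sketch (the default shown is illustrative, not Haystack's):

    from typing import Dict, Optional

    def run_eval(index_params: Optional[Dict] = None) -> Dict:
        if index_params is None:                 # preserves an explicitly passed {}
            index_params = {"batch_size": 32}
        return index_params

    def run_eval_buggy(index_params: Optional[Dict] = None) -> Dict:
        return index_params or {"batch_size": 32}  # clobbers an explicitly passed {}

    assert run_eval({}) == {}
    assert run_eval_buggy({}) == {"batch_size": 32}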

@@ -855,11 +861,11 @@ class Pipeline:
 experiment_tracking_tool: Literal["mlflow", None] = None,
 experiment_tracking_uri: Optional[str] = None,
 corpus_file_metas: Optional[List[Dict[str, Any]]] = None,
-corpus_meta: Dict[str, Any] = {},
+corpus_meta: Optional[Dict[str, Any]] = None,
-evaluation_set_meta: Dict[str, Any] = {},
+evaluation_set_meta: Optional[Dict[str, Any]] = None,
-pipeline_meta: Dict[str, Any] = {},
+pipeline_meta: Optional[Dict[str, Any]] = None,
-index_params: dict = {},
+index_params: Optional[Dict] = None,
-query_params: dict = {},
+query_params: Optional[Dict] = None,
 sas_model_name_or_path: Optional[str] = None,
 sas_batch_size: int = 32,
 sas_use_gpu: bool = True,

@@ -997,6 +1003,17 @@ class Pipeline:
 Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
 :param context_matching_threshold: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]
 """
+if corpus_meta is None:
+    corpus_meta = {}
+if evaluation_set_meta is None:
+    evaluation_set_meta = {}
+if pipeline_meta is None:
+    pipeline_meta = {}
+if index_params is None:
+    index_params = {}
+if query_params is None:
+    query_params = {}
+
 if experiment_tracking_tool is not None:
 tracking_head_cls = TRACKING_TOOL_TO_HEAD.get(experiment_tracking_tool, None)
 if tracking_head_cls is None:
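Five near-identical checks in a row invite a helper. The commit keeps them inline, which is arguably clearer in a diff, but a compact alternative would be a small normalizer, sketched here with a hypothetical name:

    from typing import Dict, Optional, TypeVar

    T = TypeVar("T")

    def default_if_none(value: Optional[T], default: T) -> T:
        # Hypothetical helper, not part of this commit.
        return default if value is None else value

    def execute_eval_run(corpus_meta: Optional[Dict] = None, pipeline_meta: Optional[Dict] = None):
        corpus_meta = default_if_none(corpus_meta, {})      # the {} literal is fresh per call
        pipeline_meta = default_if_none(pipeline_meta, {})
        return corpus_meta, pipeline_meta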

@@ -2213,7 +2230,7 @@ class Pipeline:
 "document_id_or_answer",
 ] = "document_id_or_answer",
 answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
-wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
+wrong_examples_fields: Optional[List[str]] = None,
 max_characters_per_field: int = 150,
 ):
 """

@@ -2249,9 +2266,11 @@ class Pipeline:
 - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
 The default value is 'any'.
 In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
-:param wrong_examples_fields: A list of fields to include in the worst samples.
+:param wrong_examples_fields: A list of fields to include in the worst samples. By default, "answer", "context", and "document_id" are included.
 :param max_characters_per_field: The maximum number of characters to include in the worst samples report (per field).
 """
+if wrong_examples_fields is None:
+    wrong_examples_fields = ["answer", "context", "document_id"]
 graph = DiGraph(self.graph.edges)
 print_eval_report(
 eval_result=eval_result,

@@ -202,7 +202,7 @@ class RayPipeline(Pipeline):

 @classmethod
 def _create_ray_deployment(
-cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = {}
+cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = None
 ):
 """
 Create a Ray Deployment for the Component.

@@ -215,6 +215,8 @@ class RayPipeline(Pipeline):
 Ray Serve API docs (https://docs.ray.io/en/latest/serve/package-ref.html)
 under the `ray.serve.deployment()` method
 """
+if serve_deployment_kwargs is None:
+    serve_deployment_kwargs = {}
 RayDeployment = serve.deployment(
 _RayDeploymentWrapper, name=component_name, **serve_deployment_kwargs # type: ignore
 )
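The RayPipeline signature was misleading twice over: the annotation promised `Optional[Dict[str, Any]]` while the default was a concrete (and shared) `{}`. Because the value is expanded with `**`, the body must end up with a real mapping either way, so the sentinel has to be normalized before the splat. A self-contained sketch (illustrative function, not Ray's API):

    from typing import Any, Dict, Optional

    def create_deployment(name: str, deployment_kwargs: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        if deployment_kwargs is None:
            deployment_kwargs = {}           # ** requires a mapping, never None
        return dict(name=name, **deployment_kwargs)

    print(create_deployment("reader"))                       # {'name': 'reader'}
    print(create_deployment("reader", {"num_replicas": 2}))  # {'name': 'reader', 'num_replicas': 2}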

@@ -241,7 +241,7 @@ class BaseStandardPipeline(ABC):
 "document_id_or_answer",
 ] = "document_id_or_answer",
 answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
-wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
+wrong_examples_fields: Optional[List[str]] = None,
 max_characters_per_field: int = 150,
 ):
 """

@@ -277,9 +277,11 @@ class BaseStandardPipeline(ABC):
 - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
 The default value is 'any'.
 In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
-:param wrong_examples_fields: A list of field names to include in the worst samples.
+:param wrong_examples_fields: A list of field names to include in the worst samples. By default, "answer", "context", and "document_id" are used.
 :param max_characters_per_field: The maximum number of characters per wrong example to show (per field).
 """
+if wrong_examples_fields is None:
+    wrong_examples_fields = ["answer", "context", "document_id"]
 if metrics_filter is None:
 metrics_filter = self.metrics_filter
 self.pipeline.print_eval_report(

@@ -178,7 +178,7 @@ def print_eval_report(
 "document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"
 ] = "document_id_or_answer",
 answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
-wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
+wrong_examples_fields: Optional[List[str]] = None,
 max_characters_per_field: int = 150,
 ):
 """

@@ -216,9 +216,11 @@ def print_eval_report(
 - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
 The default value is 'any'.
 In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
-:param wrong_examples_fields: A list of field names that should be included in the wrong examples.
+:param wrong_examples_fields: A list of field names that should be included in the wrong examples. By default, "answer", "context", and "document_id" are used.
 :param max_characters_per_field: The maximum number of characters to show in the wrong examples report (per field).
 """
+if wrong_examples_fields is None:
+    wrong_examples_fields = ["answer", "context", "document_id"]
 if any(degree > 1 for node, degree in graph.out_degree):
 logger.warning("Pipelines with junctions are currently not supported.")
 return

@@ -309,9 +311,11 @@ def _format_wrong_examples_report(
 "document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"
 ] = "document_id_or_answer",
 answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
-fields: List[str] = ["answer", "context", "document_id"],
+fields: Optional[List[str]] = None,
 max_chars: int = 150,
 ):
+if fields is None:
+    fields = ["answer", "context", "document_id"]
 examples = {
 node: eval_result.wrong_examples(
 node, document_scope=document_scope, answer_scope=answer_scope, n=n_wrong_examples

@@ -302,7 +302,9 @@ class SpeechDocument(Document):
 return f"<SpeechDocument: id={self.id}, content=None>"
 return f"<SpeechDocument: id={self.id}, content='{self.content[:100]}{'...' if len(self.content) > 100 else ''}', content_audio={self.content_audio}>"

-def to_dict(self, field_map={}) -> Dict:
+def to_dict(self, field_map=None) -> Dict:
+if field_map is None:
+    field_map = {}
 dictionary = super().to_dict(field_map=field_map)
 for key, value in dictionary.items():
 if isinstance(value, Path):

@@ -310,7 +312,9 @@ class SpeechDocument(Document):
 return dictionary

 @classmethod
-def from_dict(cls, dict, field_map={}, id_hash_keys=None):
+def from_dict(cls, dict, field_map=None, id_hash_keys=None):
+if field_map is None:
+    field_map = {}
 doc = super().from_dict(dict=dict, field_map=field_map, id_hash_keys=id_hash_keys)
 doc.content_audio = Path(dict["content_audio"])
 return doc
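Serialization helpers are a classic home for this bug: with a shared `field_map={}` default, any mutation of the mapping is stored on the function object itself and leaks into every later `to_dict()` call. A toy demonstration (not Haystack's Document class):

    class Doc:
        def __init__(self, content: str):
            self.content = content

        def to_dict(self, field_map={}):   # BAD: one dict shared by every call
            field_map["content"] = "text"  # the mutation outlives this call
            return {field_map["content"]: self.content}

    Doc("hello").to_dict()
    print(Doc.to_dict.__defaults__)  # ({'content': 'text'},) -- the "empty" default is empty no more

With the fixed `field_map=None` signature, `__defaults__` stays `(None,)`, and since `None` is immutable there is nothing left to leak.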

@@ -133,7 +133,7 @@ def send_event(func):
 return wrapper


-def send_custom_event(event: str = "", payload: Dict[str, Any] = {}):
+def send_custom_event(event: str = "", payload: Optional[Dict[str, Any]] = None):
 """
 This method can be called directly from anywhere in Haystack to send an event.
 Enriches the given event with metadata and sends it to the posthog server if telemetry is enabled.

@@ -143,6 +143,8 @@ def send_custom_event(event: str = "", payload: Dict[str, Any] = {}):
 :param payload: A dictionary containing event meta data, e.g., parameter settings
 """
 global user_id # pylint: disable=global-statement
+if payload is None:
+    payload = {}
 try:

 def send_request(payload: Dict[str, Any]):

@@ -144,7 +144,7 @@ class DeepsetCloudClient:
 def post(
 self,
 url: str,
-json: dict = {},
+json: Optional[Dict] = None,
 data: Optional[Any] = None,
 query_params: Optional[dict] = None,
 headers: Optional[dict] = None,

@@ -152,6 +152,8 @@ class DeepsetCloudClient:
 files: Optional[Any] = None,
 raise_on_error: bool = True,
 ):
+if json is None:
+    json = {}
 return self._execute_request(
 method="POST",
 url=url,

@@ -167,7 +169,7 @@ class DeepsetCloudClient:
 def post_with_auto_paging(
 self,
 url: str,
-json: dict = {},
+json: Optional[Dict] = None,
 data: Optional[Any] = None,
 query_params: Optional[dict] = None,
 headers: Optional[dict] = None,

@@ -175,6 +177,8 @@ class DeepsetCloudClient:
 raise_on_error: bool = True,
 auto_paging_page_size: Optional[int] = None,
 ):
+if json is None:
+    json = {}
 return self._execute_auto_paging_request(
 method="POST",
 url=url,

@@ -211,7 +215,7 @@ class DeepsetCloudClient:
 def put_with_auto_paging(
 self,
 url: str,
-json: dict = {},
+json: Optional[Dict] = None,
 data: Optional[Any] = None,
 query_params: Optional[dict] = None,
 headers: Optional[dict] = None,

@@ -219,6 +223,8 @@ class DeepsetCloudClient:
 raise_on_error: bool = True,
 auto_paging_page_size: Optional[int] = None,
 ):
+if json is None:
+    json = {}
 return self._execute_auto_paging_request(
 method="PUT",
 url=url,

@@ -278,7 +278,6 @@ disable = [
 "unspecified-encoding",
 "unidiomatic-typecheck",
 "no-name-in-module",
-"dangerous-default-value",
 "consider-using-with",
 "redefined-outer-name",
 "arguments-renamed",
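The final hunk is the payoff: with every offender fixed, `dangerous-default-value` no longer needs to be disabled, so pylint now guards against regressions. Pylint reports this check as W0102; running it over a file like the following (hypothetical file name) should produce a message along the lines of `W0102: Dangerous default value [] as argument (dangerous-default-value)`:

    # bad_default.py -- pylint flags this now that the check is re-enabled
    def collect(item, bucket=[]):
        bucket.append(item)
        return bucket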