refactor: replace mutable default arguments (#4070)

* refactor: replace mutable default arguments

* change type annotation in BasePreProcessor to Optional[List]
Julian Risch 2023-02-07 09:30:33 +01:00 committed by GitHub
parent 3273a2714d
commit 0e282e5ca4
28 changed files with 216 additions and 94 deletions
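The pattern applied across these 28 files is the standard fix for Python's mutable-default-argument pitfall: a default such as `[]` or `{}` is evaluated once, when the function is defined, and is then shared by every call, so any mutation leaks across calls. Replacing the default with `None` and creating the container inside the function body gives each call a fresh object while leaving explicit callers untouched. A minimal sketch of the before/after behavior (hypothetical `add_item` functions, not taken from this diff):

from typing import List, Optional

def add_item_buggy(item: str, items: List[str] = []) -> List[str]:
    # The default list is created once at definition time and reused across calls.
    items.append(item)
    return items

def add_item_fixed(item: str, items: Optional[List[str]] = None) -> List[str]:
    # None as sentinel: build a fresh list per call, as done throughout this commit.
    if items is None:
        items = []
    items.append(item)
    return items

print(add_item_buggy("a"))  # ['a']
print(add_item_buggy("b"))  # ['a', 'b']  <- state leaked from the previous call
print(add_item_fixed("a"))  # ['a']
print(add_item_fixed("b"))  # ['b']       <- independent calls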

View File

@ -49,7 +49,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
use_bm25: bool = False,
bm25_tokenization_regex: str = r"(?u)\b\w\w+\b",
bm25_algorithm: Literal["BM25Okapi", "BM25L", "BM25Plus"] = "BM25Okapi",
bm25_parameters: dict = {},
bm25_parameters: Optional[Dict] = None,
):
"""
:param index: The documents are scoped to an index attribute that can be used when writing, querying,
@ -87,7 +87,10 @@ class InMemoryDocumentStore(KeywordDocumentStore):
:param bm25_parameters: Parameters for BM25 implementation in a dictionary format.
For example: {'k1':1.5, 'b':0.75, 'epsilon':0.25}
You can learn more about these parameters by visiting https://github.com/dorianbrown/rank_bm25
By default, no parameters are set.
"""
if bm25_parameters is None:
bm25_parameters = {}
super().__init__()
self.indexes: Dict[str, Dict] = defaultdict(dict)

View File

@ -68,7 +68,7 @@ class PineconeDocumentStore(BaseDocumentStore):
progress_bar: bool = True,
duplicate_documents: str = "overwrite",
recreate_index: bool = False,
metadata_config: dict = {"indexed": []},
metadata_config: Optional[Dict] = None,
validate_index_sync: bool = True,
):
"""
@ -106,6 +106,8 @@ class PineconeDocumentStore(BaseDocumentStore):
Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. By default,
no fields are indexed.
"""
if metadata_config is None:
metadata_config = {"indexed": []}
# Connect to Pinecone server using python client binding
if not api_key:
raise PineconeDocumentStoreError(
@ -201,12 +203,14 @@ class PineconeDocumentStore(BaseDocumentStore):
replicas: Optional[int] = 1,
shards: Optional[int] = 1,
recreate_index: bool = False,
metadata_config: dict = {"indexed": []},
metadata_config: Optional[Dict] = None,
):
"""
Create a new index for storing documents if an index with
that name doesn't already exist.
"""
if metadata_config is None:
metadata_config = {"indexed": []}
index = self._index_name(index)
if recreate_index:

View File

@ -555,7 +555,7 @@ class DataSiloForCrossVal:
def make(
cls,
datasilo: DataSilo,
sets: List[str] = ["train", "dev", "test"],
sets: Optional[List[str]] = None,
n_splits: int = 5,
shuffle: bool = True,
random_state: Optional[int] = None,
@ -568,7 +568,7 @@ class DataSiloForCrossVal:
original data silo passed on.
:param datasilo: The data silo that contains the original data.
:param sets: Which sets to use to create the xval folds (strings)
:param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used.
:param n_splits: number of folds to create
:param shuffle: shuffle each class' samples before splitting
:param random_state: random state for shuffling
@ -576,6 +576,8 @@ class DataSiloForCrossVal:
It is never done with question answering.
:param n_neg_answers_per_question: number of negative answers per question to include for training
"""
if sets is None:
sets = ["train", "dev", "test"]
if "question_answering" in datasilo.processor.tasks and n_inner_splits is None: # type: ignore
return cls._make_question_answering(
datasilo, sets, n_splits, shuffle, random_state, n_neg_answers_per_question
@ -588,7 +590,7 @@ class DataSiloForCrossVal:
def _make_question_answering(
cls,
datasilo: DataSilo,
sets: List[str] = ["train", "dev", "test"],
sets: Optional[List[str]] = None,
n_splits: int = 5,
shuffle: bool = True,
random_state: Optional[int] = None,
@ -600,12 +602,14 @@ class DataSiloForCrossVal:
data for question-answering.
:param datasilo: The data silo that contains the original data.
:param sets: Which sets to use to create the xval folds (strings).
:param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used.
:param n_splits: Number of folds to create.
:param shuffle: Shuffle each class' samples before splitting.
:param random_state: Random state for shuffling.
:param n_neg_answers_per_question: Number of negative answers per question to include for training.
"""
if sets is None:
sets = ["train", "dev", "test"]
assert "id" in datasilo.tensor_names, f"Expected tensor 'id' in tensor names, found {datasilo.tensor_names}" # type: ignore
assert "labels" in datasilo.tensor_names, f"Expected tensor 'labels' in tensor names, found {datasilo.tensor_names}" # type: ignore

View File

@ -59,7 +59,7 @@ class Processor(ABC):
test_filename: Optional[Union[Path, str]],
dev_split: float,
data_dir: Optional[Union[Path, str]],
tasks: Dict = {},
tasks: Optional[Dict] = None,
proxies: Optional[Dict] = None,
multithreading_rust: Optional[bool] = True,
):
@ -82,6 +82,8 @@ class Processor(ABC):
Note: Enabling multithreading in Rust AND multiprocessing in python might cause
deadlocks.
"""
if tasks is None:
tasks = {}
if not multithreading_rust:
os.environ["RAYON_RS_NUM_CPUS"] = "1"
@ -313,7 +315,7 @@ class Processor(ABC):
@abstractmethod
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
raise NotImplementedError()
@ -444,7 +446,7 @@ class SquadProcessor(Processor):
)
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for Question Answering.
@ -456,6 +458,8 @@ class SquadProcessor(Processor):
:param indices: list, indices used during multiprocessing so that IDs assigned to our baskets are unique
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
"""
if indices is None:
indices = []
# Convert to standard format
pre_baskets = [self.convert_qa_input_dict(x) for x in dicts] # TODO move to input object conversion
@ -990,7 +994,7 @@ class TextSimilarityProcessor(Processor):
json.dump(config, file)
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
@ -1013,6 +1017,8 @@ class TextSimilarityProcessor(Processor):
:param return_baskets: whether to return the baskets or not (baskets are needed during inference)
:return: dataset, tensor_names, problematic_ids, [baskets]
"""
if indices is None:
indices = []
# Take the dicts and insert them into our basket structure; this stage also adds internal IDs
baskets = self._fill_baskets(dicts, indices)
@ -1254,7 +1260,7 @@ class TableTextSimilarityProcessor(Processor):
dev_split: float = 0.1,
proxies: Optional[Dict] = None,
max_samples: Optional[int] = None,
embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
embed_meta_fields: Optional[List[str]] = None,
num_positives: int = 1,
num_hard_negatives: int = 1,
shuffle_negatives: bool = True,
@ -1284,7 +1290,7 @@ class TableTextSimilarityProcessor(Processor):
:param proxies: Proxy configuration to allow downloads of remote datasets.
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:param max_samples: maximum number of samples to use.
:param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization.
:param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization. By default, "page_title", "section_title", and "caption" are used.
:param num_hard_negatives: Maximum number of hard negative context passages in a sample.
:param num_positives: Maximum number of positive context passages in a sample.
:param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the
@ -1296,6 +1302,8 @@ class TableTextSimilarityProcessor(Processor):
"""
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs
if embed_meta_fields is None:
embed_meta_fields = ["page_title", "section_title", "caption"]
# Custom processor attributes
self.max_samples = max_samples
self.query_tokenizer = query_tokenizer
@ -1511,7 +1519,7 @@ class TableTextSimilarityProcessor(Processor):
return standard_dicts
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity.
@ -1533,7 +1541,8 @@ class TableTextSimilarityProcessor(Processor):
:param indices: list, indices used during multiprocessing so that IDs assigned to our baskets are unique
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
"""
if indices is None:
indices = []
# Take the dicts and insert them into our basket structure; this stage also adds internal IDs
baskets = self._fill_baskets(dicts, indices)
@ -1861,8 +1870,10 @@ class TextClassificationProcessor(Processor):
raise NotImplementedError
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
if indices is None:
indices = []
baskets = []
# Tokenize in batches
texts = [x["text"] for x in dicts]
@ -2043,10 +2054,12 @@ class UnlabeledTextProcessor(Processor):
test_filename: Optional[Union[Path, str]] = None,
dev_split: float = 0,
data_dir: Optional[Union[Path, str]] = None,
tasks: Dict = {},
tasks: Optional[Dict] = None,
proxies: Optional[Dict] = None,
multithreading_rust: Optional[bool] = True,
):
if tasks is None:
tasks = {}
super().__init__(
tokenizer,
max_seq_len,
@ -2069,8 +2082,10 @@ class UnlabeledTextProcessor(Processor):
return dicts
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False
):
if indices is None:
indices = []
if return_baskets:
raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor")
texts = [dict_["text"] for dict_ in dicts]

View File

@ -42,8 +42,8 @@ class BiAdaptiveModel(nn.Module):
prediction_heads: List[PredictionHead],
embeds_dropout_prob: float = 0.1,
device: torch.device = torch.device("cuda"),
lm1_output_types: Union[str, List[str]] = ["per_sequence"],
lm2_output_types: Union[str, List[str]] = ["per_sequence"],
lm1_output_types: Optional[Union[str, List[str]]] = None,
lm2_output_types: Optional[Union[str, List[str]]] = None,
loss_aggregation_fn: Optional[Callable] = None,
):
"""
@ -54,12 +54,12 @@ class BiAdaptiveModel(nn.Module):
language models will be zeroed.
:param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set
to "per_token", one embedding will be extracted per input token. If set to
"per_sequence", a single embedding will be extracted to represent the full
"per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set
to "per_token", one embedding will be extracted per input token. If set to
"per_sequence", a single embedding will be extracted to represent the full
"per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda").
@ -74,6 +74,10 @@ class BiAdaptiveModel(nn.Module):
Note: The loss at this stage is per sample, i.e. one tensor of
shape (batchsize) per prediction head.
"""
if lm1_output_types is None:
lm1_output_types = ["per_sequence"]
if lm2_output_types is None:
lm2_output_types = ["per_sequence"]
super(BiAdaptiveModel, self).__init__()
self.device = device

View File

@ -231,7 +231,7 @@ class QuestionAnsweringHead(PredictionHead):
def __init__(
self,
layer_dims: List[int] = [768, 2],
layer_dims: Optional[List[int]] = None,
task_name: str = "question_answering",
no_ans_boost: float = 0.0,
context_window_size: int = 100,
@ -244,7 +244,7 @@ class QuestionAnsweringHead(PredictionHead):
**kwargs,
):
"""
:param layer_dims: dimensions of Feed Forward block, e.g. [768,2], for adjusting to BERT embedding. Output should always be 2
:param layer_dims: dimensions of Feed Forward block, e.g. [768,2] used by default, for adjusting to BERT embedding. Output should always be 2
:param kwargs: placeholder for passing generic parameters
:param no_ans_boost: How much the no_answer logit is boosted/increased.
The higher the value, the more likely a "no answer possible given the input text" is returned by the model
@ -260,6 +260,8 @@ class QuestionAnsweringHead(PredictionHead):
:param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
"""
if layer_dims is None:
layer_dims = [768, 2]
super(QuestionAnsweringHead, self).__init__()
if len(kwargs) > 0:
logger.warning(

View File

@ -248,7 +248,7 @@ class QAPred(Pred):
aggregation_level: str,
no_answer_gap: float,
ground_truth_answer: Optional[str] = None,
answer_types: List[str] = [],
answer_types: Optional[List[str]] = None,
):
"""
:param id: The id of the passage or document
@ -262,6 +262,8 @@ class QAPred(Pred):
:param ground_truth_answer: Ground truth answers
:param answer_types: List of answer_types supported by this task e.g. ["span", "yes_no", "no_answer"]
"""
if answer_types is None:
answer_types = []
super().__init__(id, prediction, context)
self.question = question
self.token_offsets = token_offsets

View File

@ -44,9 +44,9 @@ class TriAdaptiveModel(nn.Module):
prediction_heads: List[PredictionHead],
embeds_dropout_prob: float = 0.1,
device: torch.device = torch.device("cuda"),
lm1_output_types: Union[str, List[str]] = ["per_sequence"],
lm2_output_types: Union[str, List[str]] = ["per_sequence"],
lm3_output_types: Union[str, List[str]] = ["per_sequence"],
lm1_output_types: Optional[Union[str, List[str]]] = None,
lm2_output_types: Optional[Union[str, List[str]]] = None,
lm3_output_types: Optional[Union[str, List[str]]] = None,
loss_aggregation_fn: Optional[Callable] = None,
):
"""
@ -58,17 +58,17 @@ class TriAdaptiveModel(nn.Module):
language model will be zeroed.
:param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set
to "per_token", one embedding will be extracted per input token. If set to
"per_sequence", a single embedding will be extracted to represent the full
"per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set
to "per_token", one embedding will be extracted per input token. If set to
"per_sequence", a single embedding will be extracted to represent the full
"per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param lm3_output_types: How to extract the embeddings from the final layer of the third language model. When set
to "per_token", one embedding will be extracted per input token. If set to
"per_sequence", a single embedding will be extracted to represent the full
"per_sequence" (default), a single embedding will be extracted to represent the full
input sequence. Can either be a single string, or a list of strings,
one for each prediction head.
:param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda").
@ -83,7 +83,12 @@ class TriAdaptiveModel(nn.Module):
Note: The loss at this stage is per sample, i.e. one tensor of
shape (batchsize) per prediction head.
"""
if lm1_output_types is None:
lm1_output_types = ["per_sequence"]
if lm2_output_types is None:
lm2_output_types = ["per_sequence"]
if lm3_output_types is None:
lm3_output_types = ["per_sequence"]
super(TriAdaptiveModel, self).__init__()
self.device = device
self.language_model1 = language_model1.to(device)

View File

@ -261,10 +261,12 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
return component_schema, {"$ref": f"#/definitions/{component_name}"}
def get_json_schema(filename: str, version: str, modules: List[str] = ["haystack.document_stores", "haystack.nodes"]):
def get_json_schema(filename: str, version: str, modules: Optional[List[str]] = None):
"""
Generate JSON schema for Haystack pipelines.
"""
if modules is None:
modules = ["haystack.document_stores", "haystack.nodes"]
schema_definitions = {} # All the schemas for the node and accessory classes
node_refs = [] # References to the nodes only (accessory classes cannot be listed among the nodes in a config)

View File

@ -1,5 +1,5 @@
import mimetypes
from typing import Any, Dict, List, Union
from typing import Any, Dict, List, Union, Optional
import logging
from pathlib import Path
@ -29,14 +29,16 @@ class FileTypeClassifier(BaseComponent):
outgoing_edges = len(DEFAULT_TYPES)
def __init__(self, supported_types: List[str] = DEFAULT_TYPES):
def __init__(self, supported_types: Optional[List[str]] = None):
"""
Node that sends out files on a different output edge depending on their extension.
:param supported_types: The file types that this node can distinguish between.
The default values are: `txt`, `pdf`, `md`, `docx`, and `html`.
If no value is provided, the default is: `txt`, `pdf`, `md`, `docx`, and `html`.
Lists with duplicate elements are not allowed.
"""
if supported_types is None:
supported_types = DEFAULT_TYPES
if len(set(supported_types)) != len(supported_types):
duplicates = supported_types
for item in set(supported_types):

View File

@ -137,7 +137,7 @@ class BaseConverter(BaseComponent):
file_paths: Union[Path, List[Path]],
meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
remove_numeric_tables: Optional[bool] = None,
known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
known_ligatures: Optional[Dict[str, str]] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None,
@ -153,12 +153,13 @@ class BaseConverter(BaseComponent):
does not have table parsing capability for finding answers. However, tables
may also have long strings that could be possible candidates for searching answers.
The rows containing strings are thus retained in this option.
:param known_ligatures: Some converters tends to recognize clusters of letters as ligatures, such as "ﬀ" (double f).
:param known_ligatures: Some converters tend to recognize clusters of letters as ligatures, such as "ﬀ" (double f).
Such ligatures however make text hard to compare with the content of other files,
which are generally ligature free. Therefore we automatically find and replace the most
common ligatures with their split counterparts. The default mapping is in
`haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabets
but excludes all ligatures that are known to be used in IPA.
If no value is provided, this default is created and used.
You can use this parameter to provide your own set of ligatures to clean up from the documents.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
@ -171,6 +172,8 @@ class BaseConverter(BaseComponent):
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""
if known_ligatures is None:
known_ligatures = KNOWN_LIGATURES
if isinstance(file_paths, Path):
file_paths = [file_paths]
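The ligature replacement described above amounts to a simple string substitution over the extracted text. A minimal sketch under that assumption (the mapping passed in below is illustrative; the real one lives in `haystack.nodes.file_converter.base.KNOWN_LIGATURES`):

def replace_ligatures(text: str, known_ligatures: dict) -> str:
    # Replace each recognized ligature with its split counterpart.
    for ligature, split in known_ligatures.items():
        text = text.replace(ligature, split)
    return text

print(replace_ligatures("oﬃce staﬀ", {"ﬃ": "ffi", "ﬀ": "ff"}))  # office staff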
@ -206,7 +209,7 @@ class BaseConverter(BaseComponent):
file_paths: Union[Path, List[Path]],
meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
remove_numeric_tables: Optional[bool] = None,
known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
known_ligatures: Optional[Dict[str, str]] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None,

View File

@ -24,7 +24,7 @@ class ImageToTextConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = ["eng"],
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
):
"""
@ -37,7 +37,8 @@ class ImageToTextConverter(BaseConverter):
(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
This option can be used to add a test for encoding errors. If the extracted text is
not one of the valid languages, it is most likely an encoding error resulting
in garbled text. Run the following line of code to check available language packs:
in garbled text. If no value is provided, English will be set as default.
Run the following line of code to check available language packs:
# List of available languages
print(pytesseract.get_languages(config=''))
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
@ -45,6 +46,8 @@ class ImageToTextConverter(BaseConverter):
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""
if valid_languages is None:
valid_languages = ["eng"]
super().__init__(
remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
)

View File

@ -208,7 +208,7 @@ class PDFToTextOCRConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = ["eng"],
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
):
"""
@ -223,12 +223,14 @@ class PDFToTextOCRConverter(BaseConverter):
(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
This option can be used to add a test for encoding errors. If the extracted text is
not one of the valid languages, it is most likely an encoding error resulting
in garbled text.
in garbled text. If no value is provided, English will be set as default.
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""
if valid_languages is None:
valid_languages = ["eng"]
# init image to text instance
self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)

View File

@ -21,7 +21,7 @@ class BasePreProcessor(BaseComponent):
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
remove_substrings: List[str] = [],
remove_substrings: Optional[List[str]] = None,
split_by: Literal["word", "sentence", "passage", None] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
@ -41,7 +41,7 @@ class BasePreProcessor(BaseComponent):
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
remove_substrings: List[str],
remove_substrings: Optional[List[str]],
) -> Document:
raise NotImplementedError

View File

@ -54,7 +54,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: bool = True,
clean_header_footer: bool = False,
clean_empty_lines: bool = True,
remove_substrings: List[str] = [],
remove_substrings: Optional[List[str]] = None,
split_by: Optional[Literal["word", "sentence", "passage"]] = "word",
split_length: int = 200,
split_overlap: int = 0,
@ -73,7 +73,7 @@ class PreProcessor(BasePreProcessor):
or similar.
:param clean_whitespace: Strip whitespaces before or after each line in the text.
:param clean_empty_lines: Remove more than two empty lines in the text.
:param remove_substrings: Remove specified substrings from the text.
:param remove_substrings: Remove specified substrings from the text. If no value is provided, an empty list is created by default.
:param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
:param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
"sentence", then each output document will have 10 sentences.
@ -100,6 +100,8 @@ class PreProcessor(BasePreProcessor):
`AzureConverter`.
:param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning.
"""
if remove_substrings is None:
remove_substrings = []
super().__init__()
try:
@ -132,7 +134,7 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
remove_substrings: List[str] = [],
remove_substrings: Optional[List[str]] = None,
split_by: Optional[Literal["word", "sentence", "passage"]] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
@ -143,6 +145,8 @@ class PreProcessor(BasePreProcessor):
"""
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
"""
if remove_substrings is None:
remove_substrings = []
if not isinstance(documents, list):
warnings.warn(
"Using a single Document as argument to the 'documents' parameter is deprecated. Use a list "
@ -197,14 +201,15 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
remove_substrings: List[str] = [],
remove_substrings: Optional[List[str]] = None,
split_by: Optional[Literal["word", "sentence", "passage"]] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
if remove_substrings is None:
remove_substrings = []
if clean_whitespace is None:
clean_whitespace = self.clean_whitespace
if clean_header_footer is None:
@ -258,13 +263,15 @@ class PreProcessor(BasePreProcessor):
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
remove_substrings: List[str],
remove_substrings: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
) -> Document:
"""
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
"""
if remove_substrings is None:
remove_substrings = []
if id_hash_keys is None:
id_hash_keys = self.id_hash_keys

View File

@ -67,7 +67,7 @@ class TransformersQueryClassifier(BaseQueryClassifier):
tokenizer: Optional[str] = None,
use_gpu: bool = True,
task: str = "text-classification",
labels: List[str] = DEFAULT_LABELS,
labels: Optional[List[str]] = None,
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
@ -96,6 +96,8 @@ class TransformersQueryClassifier(BaseQueryClassifier):
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
parameter is not used and a single cpu device is used for inference.
"""
if labels is None:
labels = DEFAULT_LABELS
super().__init__()
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(resolved_devices) > 1:

View File

@ -176,7 +176,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
devices: List[torch.device] = [],
devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
n_epochs: int = 2,
learning_rate: float = 1e-5,
@ -205,6 +205,8 @@ class FARMReader(BaseReader):
doc_stride: Optional[int] = None,
max_query_length: Optional[int] = None,
):
if devices is None:
devices = []
if dev_filename:
dev_split = 0
@ -363,7 +365,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
devices: List[torch.device] = [],
devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
n_epochs: int = 2,
learning_rate: float = 1e-5,
@ -469,7 +471,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
devices: List[torch.device] = [],
devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
teacher_batch_size: Optional[int] = None,
n_epochs: int = 2,
@ -595,7 +597,7 @@ class FARMReader(BaseReader):
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
devices: List[torch.device] = [],
devices: Optional[List[torch.device]] = None,
batch_size: int = 10,
teacher_batch_size: Optional[int] = None,
n_epochs: int = 5,

View File

@ -794,7 +794,7 @@ class TableTextRetriever(DenseRetriever):
top_k: int = 10,
use_gpu: bool = True,
batch_size: int = 16,
embed_meta_fields: List[str] = ["name", "section_title", "caption"],
embed_meta_fields: Optional[List[str]] = None,
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
global_loss_buffer_size: int = 150000,
@ -825,7 +825,8 @@ class TableTextRetriever(DenseRetriever):
then used to create the embedding.
This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
(topic, entities etc.). If no value is provided, a default will be created.
That default embeds name, section title and caption.
:param use_fast_tokenizers: Whether to use fast Rust tokenizers
:param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training.
Options: `dot_product` (Default) or `cosine`
@ -849,6 +850,8 @@ class TableTextRetriever(DenseRetriever):
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
:param use_fast: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True.
"""
if embed_meta_fields is None:
embed_meta_fields = ["name", "section_title", "caption"]
super().__init__()
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
@ -1225,7 +1228,7 @@ class TableTextRetriever(DenseRetriever):
max_processes: int = 128,
dev_split: float = 0,
batch_size: int = 2,
embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
embed_meta_fields: Optional[List[str]] = None,
num_hard_negatives: int = 1,
num_positives: int = 1,
n_epochs: int = 3,
@ -1260,7 +1263,7 @@ class TableTextRetriever(DenseRetriever):
:param dev_split: The proportion of the train set that will be sliced. Only works if dev_filename is set to None.
:param batch_size: Total number of samples in 1 batch of data.
:param embed_meta_fields: Concatenate meta fields with each passage and table.
The default setting in official MMRetrieval embeds page title,
If no value is provided, a default will be created. That default embeds page title,
section title and caption with the corresponding table and title with
corresponding text passage.
:param num_hard_negatives: Number of hard negative passages (passages which are
@ -1290,6 +1293,8 @@ class TableTextRetriever(DenseRetriever):
:param checkpoints_to_keep: The maximum number of train checkpoints to save.
:param early_stopping: An initialized EarlyStopping object to control early stopping and saving of the best models.
"""
if embed_meta_fields is None:
embed_meta_fields = ["page_title", "section_title", "caption"]
self.processor.embed_meta_fields = embed_meta_fields
self.processor.data_dir = Path(data_dir)
@ -1393,7 +1398,7 @@ class TableTextRetriever(DenseRetriever):
max_seq_len_table: int = 256,
use_gpu: bool = True,
batch_size: int = 16,
embed_meta_fields: List[str] = ["name", "section_title", "caption"],
embed_meta_fields: Optional[List[str]] = None,
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
query_encoder_dir: str = "query_encoder",
@ -1403,6 +1408,8 @@ class TableTextRetriever(DenseRetriever):
"""
Load TableTextRetriever from the specified directory.
"""
if embed_meta_fields is None:
embed_meta_fields = ["name", "section_title", "caption"]
load_dir = Path(load_dir)
mm_retriever = cls(
@ -1441,7 +1448,7 @@ class EmbeddingRetriever(DenseRetriever):
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
embed_meta_fields: List[str] = [],
embed_meta_fields: Optional[List[str]] = None,
api_key: Optional[str] = None,
):
"""
@ -1494,10 +1501,13 @@ class EmbeddingRetriever(DenseRetriever):
This approach is also used in the TableTextRetriever paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
If no value is provided, a default empty list will be created.
:param api_key: The OpenAI API key or the Cohere API key. Required if one wants to use OpenAI/Cohere embeddings.
For more details see https://beta.openai.com/account/api-keys and https://dashboard.cohere.ai/api-keys
"""
if embed_meta_fields is None:
embed_meta_fields = []
super().__init__()
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
@ -1929,7 +1939,7 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever):
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
embed_meta_fields: List[str] = [],
embed_meta_fields: Optional[List[str]] = None,
):
"""
:param document_store: An instance of DocumentStore from which to retrieve documents.
@ -1977,7 +1987,10 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever):
This approach is also used in the TableTextRetriever paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
If no value is provided, a default empty list will be created.
"""
if embed_meta_fields is None:
embed_meta_fields = []
super().__init__(
embedding_model=embedding_model,
document_store=document_store,

View File

@ -44,7 +44,7 @@ class MultiModalEmbedder:
embedding_models: Dict[str, Union[Path, str]], # replace str with ContentTypes starting from Python3.8
feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None,
batch_size: int = 16,
embed_meta_fields: List[str] = ["name"],
embed_meta_fields: Optional[List[str]] = None,
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
@ -67,6 +67,7 @@ class MultiModalEmbedder:
This is the approach used in the original paper and is likely to improve
performance if your titles contain meaningful information for retrieval
(topic, entities etc.).
If no value is provided, a default with "name" as the embedding field is created.
:param progress_bar: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
:param devices: List of GPU (or CPU) devices to limit inference to certain GPUs and not use all available ones.
@ -78,6 +79,8 @@ class MultiModalEmbedder:
the local token is used, which must be previously created using `transformer-cli login`.
For more information, see [Hugging Face documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained)
"""
if embed_meta_fields is None:
embed_meta_fields = ["name"]
super().__init__()
self.devices = get_devices(devices)

View File

@ -22,11 +22,11 @@ class MultiModalRetriever(DenseRetriever):
query_embedding_model: Union[Path, str],
document_embedding_models: Dict[str, Union[Path, str]], # Replace str with ContentTypes starting Python3.8
query_type: str = "text", # Replace str with ContentTypes starting Python3.8
query_feature_extractor_params: Dict[str, Any] = {"max_length": 64},
document_feature_extractors_params: Dict[str, Dict[str, Any]] = {"text": {"max_length": 256}},
query_feature_extractor_params: Optional[Dict[str, Any]] = None,
document_feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None,
top_k: int = 10,
batch_size: int = 16,
embed_meta_fields: List[str] = ["name"],
embed_meta_fields: Optional[List[str]] = None,
similarity_function: str = "dot_product",
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
@ -46,14 +46,14 @@ class MultiModalRetriever(DenseRetriever):
checkpoint with the content type it should handle ("text", "table", "image", and so on).
The format equals the one used by Hugging Face transformers' modelhub models.
:param query_type: The content type of the query ("text", "image" and so on).
:param query_feature_extraction_params: The parameters to pass to the feature extractor of the query.
:param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents.
:param query_feature_extraction_params: The parameters to pass to the feature extractor of the query. If no value is provided, a default dictionary with "max_length": 64 will be set.
:param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents. If no value is provided, a default dictionary with "text": {"max_length": 256} will be set.
:param top_k: How many documents to return per query.
:param batch_size: Number of questions or documents to encode at once. For multiple GPUs, this is
the total batch size.
:param embed_meta_fields: Concatenate the provided meta fields to a (text) pair that is then used to create
the embedding. This is likely to improve performance if your titles contain meaningful information
for retrieval (topic, entities, and so on). Note that only text and table documents support this feature.
for retrieval (topic, entities, and so on). Note that only text and table documents support this feature. If no value is provided, a default with "name" as the embedding field will be created.
:param similarity_function: Which function to apply for calculating the similarity of query and document
embeddings during training. Options: `dot_product` (default) or `cosine`.
:param progress_bar: Whether to show a tqdm progress bar or not.
@ -72,6 +72,12 @@ class MultiModalRetriever(DenseRetriever):
range are scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise raw similarity scores (for example, cosine or dot_product) are used.
"""
if query_feature_extractor_params is None:
query_feature_extractor_params = {"max_length": 64}
if document_feature_extractors_params is None:
document_feature_extractors_params = {"text": {"max_length": 256}}
if embed_meta_fields is None:
embed_meta_fields = ["name"]
super().__init__()
self.similarity_function = similarity_function

View File

@ -745,12 +745,12 @@ class Pipeline:
cls,
index_pipeline: Pipeline,
query_pipeline: Pipeline,
index_params: dict = {},
query_params: dict = {},
index_params: Optional[Dict] = None,
query_params: Optional[Dict] = None,
dataset: str = "scifact",
dataset_dir: Path = Path("."),
num_documents: Optional[int] = None,
top_k_values: List[int] = [1, 3, 5, 10, 100, 1000],
top_k_values: Optional[List[int]] = None,
keep_index: bool = False,
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]:
"""
@ -765,7 +765,7 @@ class Pipeline:
:param dataset_dir: The directory to store the dataset to.
:param num_documents: Maximum number of documents to load from given dataset. If set to None (default)
or to a value larger than the number of documents in the dataset, the full dataset is loaded.
:param top_k_values: The top_k values each metric will be calculated for.
:param top_k_values: The top_k values each metric will be calculated for. By default, the values are 1, 3, 5, 10, 100, and 1000.
:param keep_index: Whether to keep the index after evaluation.
If True the index will be kept after beir evaluation. Otherwise it will be deleted immediately afterwards.
Defaults to False.
@ -773,6 +773,12 @@ class Pipeline:
Returns a tuple containing the ncdg, map, recall and precision scores.
Each metric is represented by a dictionary containing the scores for each top_k value.
"""
if index_params is None:
index_params = {}
if query_params is None:
query_params = {}
if top_k_values is None:
top_k_values = [1, 3, 5, 10, 100, 1000]
try:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
@ -855,11 +861,11 @@ class Pipeline:
experiment_tracking_tool: Literal["mlflow", None] = None,
experiment_tracking_uri: Optional[str] = None,
corpus_file_metas: Optional[List[Dict[str, Any]]] = None,
corpus_meta: Dict[str, Any] = {},
evaluation_set_meta: Dict[str, Any] = {},
pipeline_meta: Dict[str, Any] = {},
index_params: dict = {},
query_params: dict = {},
corpus_meta: Optional[Dict[str, Any]] = None,
evaluation_set_meta: Optional[Dict[str, Any]] = None,
pipeline_meta: Optional[Dict[str, Any]] = None,
index_params: Optional[Dict] = None,
query_params: Optional[Dict] = None,
sas_model_name_or_path: Optional[str] = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
@ -997,6 +1003,17 @@ class Pipeline:
Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
:param context_matching_threshold: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]
"""
if corpus_meta is None:
corpus_meta = {}
if evaluation_set_meta is None:
evaluation_set_meta = {}
if pipeline_meta is None:
pipeline_meta = {}
if index_params is None:
index_params = {}
if query_params is None:
query_params = {}
if experiment_tracking_tool is not None:
tracking_head_cls = TRACKING_TOOL_TO_HEAD.get(experiment_tracking_tool, None)
if tracking_head_cls is None:
@ -2213,7 +2230,7 @@ class Pipeline:
"document_id_or_answer",
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
wrong_examples_fields: Optional[List[str]] = None,
max_characters_per_field: int = 150,
):
"""
@ -2249,9 +2266,11 @@ class Pipeline:
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
:param wrong_examples_fields: A list of fields to include in the worst samples.
:param wrong_examples_fields: A list of fields to include in the worst samples. By default, "answer", "context", and "document_id" are included.
:param max_characters_per_field: The maximum number of characters to include in the worst samples report (per field).
"""
if wrong_examples_fields is None:
wrong_examples_fields = ["answer", "context", "document_id"]
graph = DiGraph(self.graph.edges)
print_eval_report(
eval_result=eval_result,

View File

@ -202,7 +202,7 @@ class RayPipeline(Pipeline):
@classmethod
def _create_ray_deployment(
cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = {}
cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = None
):
"""
Create a Ray Deployment for the Component.
@ -215,6 +215,8 @@ class RayPipeline(Pipeline):
Ray Serve API docs (https://docs.ray.io/en/latest/serve/package-ref.html)
under the `ray.serve.deployment()` method
"""
if serve_deployment_kwargs is None:
serve_deployment_kwargs = {}
RayDeployment = serve.deployment(
_RayDeploymentWrapper, name=component_name, **serve_deployment_kwargs # type: ignore
)

View File

@ -241,7 +241,7 @@ class BaseStandardPipeline(ABC):
"document_id_or_answer",
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
wrong_examples_fields: Optional[List[str]] = None,
max_characters_per_field: int = 150,
):
"""
@ -277,9 +277,11 @@ class BaseStandardPipeline(ABC):
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
:param wrong_examples_fields: A list of field names to include in the worst samples.
:param wrong_examples_fields: A list of field names to include in the worst samples. By default, "answer", "context", and "document_id" are used.
:param max_characters_per_field: The maximum number of characters per wrong example to show (per field).
"""
if wrong_examples_fields is None:
wrong_examples_fields = ["answer", "context", "document_id"]
if metrics_filter is None:
metrics_filter = self.metrics_filter
self.pipeline.print_eval_report(

View File

@ -178,7 +178,7 @@ def print_eval_report(
"document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
wrong_examples_fields: List[str] = ["answer", "context", "document_id"],
wrong_examples_fields: Optional[List[str]] = None,
max_characters_per_field: int = 150,
):
"""
@ -216,9 +216,11 @@ def print_eval_report(
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
:param wrong_examples_fields: A list of field names that should be included in the wrong examples.
:param wrong_examples_fields: A list of field names that should be included in the wrong examples. By default, "answer", "context", and "document_id" are used.
:param max_characters_per_field: The maximum number of characters to show in the wrong examples report (per field).
"""
if wrong_examples_fields is None:
wrong_examples_fields = ["answer", "context", "document_id"]
if any(degree > 1 for node, degree in graph.out_degree):
logger.warning("Pipelines with junctions are currently not supported.")
return
@ -309,9 +311,11 @@ def _format_wrong_examples_report(
"document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
fields: List[str] = ["answer", "context", "document_id"],
fields: Optional[List[str]] = None,
max_chars: int = 150,
):
if fields is None:
fields = ["answer", "context", "document_id"]
examples = {
node: eval_result.wrong_examples(
node, document_scope=document_scope, answer_scope=answer_scope, n=n_wrong_examples

View File

@ -302,7 +302,9 @@ class SpeechDocument(Document):
return f"<SpeechDocument: id={self.id}, content=None>"
return f"<SpeechDocument: id={self.id}, content='{self.content[:100]}{'...' if len(self.content) > 100 else ''}', content_audio={self.content_audio}>"
def to_dict(self, field_map={}) -> Dict:
def to_dict(self, field_map=None) -> Dict:
if field_map is None:
field_map = {}
dictionary = super().to_dict(field_map=field_map)
for key, value in dictionary.items():
if isinstance(value, Path):
@ -310,7 +312,9 @@ class SpeechDocument(Document):
return dictionary
@classmethod
def from_dict(cls, dict, field_map={}, id_hash_keys=None):
def from_dict(cls, dict, field_map=None, id_hash_keys=None):
if field_map is None:
field_map = {}
doc = super().from_dict(dict=dict, field_map=field_map, id_hash_keys=id_hash_keys)
doc.content_audio = Path(dict["content_audio"])
return doc

View File

@ -133,7 +133,7 @@ def send_event(func):
return wrapper
def send_custom_event(event: str = "", payload: Dict[str, Any] = {}):
def send_custom_event(event: str = "", payload: Optional[Dict[str, Any]] = None):
"""
This method can be called directly from anywhere in Haystack to send an event.
Enriches the given event with metadata and sends it to the posthog server if telemetry is enabled.
@ -143,6 +143,8 @@ def send_custom_event(event: str = "", payload: Dict[str, Any] = {}):
:param payload: A dictionary containing event metadata, e.g., parameter settings
"""
global user_id # pylint: disable=global-statement
if payload is None:
payload = {}
try:
def send_request(payload: Dict[str, Any]):

View File

@ -144,7 +144,7 @@ class DeepsetCloudClient:
def post(
self,
url: str,
json: dict = {},
json: Optional[Dict] = None,
data: Optional[Any] = None,
query_params: Optional[dict] = None,
headers: Optional[dict] = None,
@ -152,6 +152,8 @@ class DeepsetCloudClient:
files: Optional[Any] = None,
raise_on_error: bool = True,
):
if json is None:
json = {}
return self._execute_request(
method="POST",
url=url,
@ -167,7 +169,7 @@ class DeepsetCloudClient:
def post_with_auto_paging(
self,
url: str,
json: dict = {},
json: Optional[Dict] = None,
data: Optional[Any] = None,
query_params: Optional[dict] = None,
headers: Optional[dict] = None,
@ -175,6 +177,8 @@ class DeepsetCloudClient:
raise_on_error: bool = True,
auto_paging_page_size: Optional[int] = None,
):
if json is None:
json = {}
return self._execute_auto_paging_request(
method="POST",
url=url,
@ -211,7 +215,7 @@ class DeepsetCloudClient:
def put_with_auto_paging(
self,
url: str,
json: dict = {},
json: Optional[Dict] = None,
data: Optional[Any] = None,
query_params: Optional[dict] = None,
headers: Optional[dict] = None,
@ -219,6 +223,8 @@ class DeepsetCloudClient:
raise_on_error: bool = True,
auto_paging_page_size: Optional[int] = None,
):
if json is None:
json = {}
return self._execute_auto_paging_request(
method="PUT",
url=url,

View File

@ -278,7 +278,6 @@ disable = [
"unspecified-encoding",
"unidiomatic-typecheck",
"no-name-in-module",
"dangerous-default-value",
"consider-using-with",
"redefined-outer-name",
"arguments-renamed",