diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py index 195f9a1ce..418b1bbeb 100644 --- a/haystack/document_stores/memory.py +++ b/haystack/document_stores/memory.py @@ -49,7 +49,7 @@ class InMemoryDocumentStore(KeywordDocumentStore): use_bm25: bool = False, bm25_tokenization_regex: str = r"(?u)\b\w\w+\b", bm25_algorithm: Literal["BM25Okapi", "BM25L", "BM25Plus"] = "BM25Okapi", - bm25_parameters: dict = {}, + bm25_parameters: Optional[Dict] = None, ): """ :param index: The documents are scoped to an index attribute that can be used when writing, querying, @@ -87,7 +87,10 @@ class InMemoryDocumentStore(KeywordDocumentStore): :param bm25_parameters: Parameters for BM25 implementation in a dictionary format. For example: {'k1':1.5, 'b':0.75, 'epsilon':0.25} You can learn more about these parameters by visiting https://github.com/dorianbrown/rank_bm25 + By default, no parameters are set. """ + if bm25_parameters is None: + bm25_parameters = {} super().__init__() self.indexes: Dict[str, Dict] = defaultdict(dict) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index bb6833946..5b6c211ad 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -68,7 +68,7 @@ class PineconeDocumentStore(BaseDocumentStore): progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, - metadata_config: dict = {"indexed": []}, + metadata_config: Optional[Dict] = None, validate_index_sync: bool = True, ): """ @@ -106,6 +106,8 @@ class PineconeDocumentStore(BaseDocumentStore): Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. By default, no fields are indexed. """ + if metadata_config is None: + metadata_config = {"indexed": []} # Connect to Pinecone server using python client binding if not api_key: raise PineconeDocumentStoreError( @@ -201,12 +203,14 @@ class PineconeDocumentStore(BaseDocumentStore): replicas: Optional[int] = 1, shards: Optional[int] = 1, recreate_index: bool = False, - metadata_config: dict = {"indexed": []}, + metadata_config: Optional[Dict] = None, ): """ Create a new index for storing documents in case an index with the name doesn't exist already. """ + if metadata_config is None: + metadata_config = {"indexed": []} index = self._index_name(index) if recreate_index: diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py index 5ce6bb6c0..c8c829b9d 100644 --- a/haystack/modeling/data_handler/data_silo.py +++ b/haystack/modeling/data_handler/data_silo.py @@ -555,7 +555,7 @@ class DataSiloForCrossVal: def make( cls, datasilo: DataSilo, - sets: List[str] = ["train", "dev", "test"], + sets: Optional[List[str]] = None, n_splits: int = 5, shuffle: bool = True, random_state: Optional[int] = None, @@ -568,7 +568,7 @@ class DataSiloForCrossVal: original data silo passed on. :param datasilo: The data silo that contains the original data. - :param sets: Which sets to use to create the xval folds (strings) + :param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used. :param n_splits: number of folds to create :param shuffle: shuffle each class' samples before splitting :param random_state: random state for shuffling @@ -576,6 +576,8 @@ class DataSiloForCrossVal: It is never done with question answering. 
:param n_neg_answers_per_question: number of negative answers per question to include for training """ + if sets is None: + sets = ["train", "dev", "test"] if "question_answering" in datasilo.processor.tasks and n_inner_splits is None: # type: ignore return cls._make_question_answering( datasilo, sets, n_splits, shuffle, random_state, n_neg_answers_per_question @@ -588,7 +590,7 @@ class DataSiloForCrossVal: def _make_question_answering( cls, datasilo: DataSilo, - sets: List[str] = ["train", "dev", "test"], + sets: Optional[List[str]] = None, n_splits: int = 5, shuffle: bool = True, random_state: Optional[int] = None, @@ -600,12 +602,14 @@ class DataSiloForCrossVal: data for question-answering- :param datasilo: The data silo that contains the original data. - :param sets: Which sets to use to create the xval folds (strings). + :param sets: Which sets to use to create the xval folds (strings). By default, "train", "dev", and "test" are used. :param n_splits: Number of folds to create. :param shuffle: Shuffle each class' samples before splitting. :param random_state: Random state for shuffling. :param n_neg_answers_per_question: Number of negative answers per question to include for training. """ + if sets is None: + sets = ["train", "dev", "test"] assert "id" in datasilo.tensor_names, f"Expected tensor 'id' in tensor names, found {datasilo.tensor_names}" # type: ignore assert "labels" in datasilo.tensor_names, f"Expected tensor 'labels' in tensor names, found {datasilo.tensor_names}" # type: ignore diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index 004b3eb92..c33257988 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -59,7 +59,7 @@ class Processor(ABC): test_filename: Optional[Union[Path, str]], dev_split: float, data_dir: Optional[Union[Path, str]], - tasks: Dict = {}, + tasks: Optional[Dict] = None, proxies: Optional[Dict] = None, multithreading_rust: Optional[bool] = True, ): @@ -82,6 +82,8 @@ class Processor(ABC): Note: Enabling multithreading in Rust AND multiprocessing in python might cause deadlocks. """ + if tasks is None: + tasks = {} if not multithreading_rust: os.environ["RAYON_RS_NUM_CPUS"] = "1" @@ -313,7 +315,7 @@ class Processor(ABC): @abstractmethod def dataset_from_dicts( - self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False ): raise NotImplementedError() @@ -444,7 +446,7 @@ class SquadProcessor(Processor): ) def dataset_from_dicts( - self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False ): """ Convert input dictionaries into a pytorch dataset for Question Answering. 
@@ -456,6 +458,8 @@ class SquadProcessor(Processor): :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique :param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference) """ + if indices is None: + indices = [] # Convert to standard format pre_baskets = [self.convert_qa_input_dict(x) for x in dicts] # TODO move to input object conversion @@ -990,7 +994,7 @@ class TextSimilarityProcessor(Processor): json.dump(config, file) def dataset_from_dicts( - self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False ): """ Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR). @@ -1013,6 +1017,8 @@ class TextSimilarityProcessor(Processor): :param return_baskets: whether to return the baskets or not (baskets are needed during inference) :return: dataset, tensor_names, problematic_ids, [baskets] """ + if indices is None: + indices = [] # Take the dict and insert into our basket structure, this stages also adds an internal IDs baskets = self._fill_baskets(dicts, indices) @@ -1254,7 +1260,7 @@ class TableTextSimilarityProcessor(Processor): dev_split: float = 0.1, proxies: Optional[Dict] = None, max_samples: Optional[int] = None, - embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], + embed_meta_fields: Optional[List[str]] = None, num_positives: int = 1, num_hard_negatives: int = 1, shuffle_negatives: bool = True, @@ -1284,7 +1290,7 @@ class TableTextSimilarityProcessor(Processor): :param proxies: Proxy configuration to allow downloads of remote datasets. Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies :param max_samples: maximum number of samples to use. - :param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization. + :param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization. By default, "page_title", "section_title", and "caption" are used. :param num_hard_negatives: Maximum number of hard negative context passages in a sample. :param num_positives: Maximum number of positive context passages in a sample. :param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the @@ -1296,6 +1302,8 @@ class TableTextSimilarityProcessor(Processor): """ # TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs + if embed_meta_fields is None: + embed_meta_fields = ["page_title", "section_title", "caption"] # Custom processor attributes self.max_samples = max_samples self.query_tokenizer = query_tokenizer @@ -1511,7 +1519,7 @@ class TableTextSimilarityProcessor(Processor): return standard_dicts def dataset_from_dicts( - self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False ): """ Convert input dictionaries into a pytorch dataset for TextSimilarity. 
@@ -1533,7 +1541,8 @@ class TableTextSimilarityProcessor(Processor): :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique :param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference) """ - + if indices is None: + indices = [] # Take the dict and insert into our basket structure, this stages also adds an internal IDs baskets = self._fill_baskets(dicts, indices) @@ -1861,8 +1870,10 @@ class TextClassificationProcessor(Processor): raise NotImplementedError def dataset_from_dicts( - self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False ): + if indices is None: + indices = [] baskets = [] # Tokenize in batches texts = [x["text"] for x in dicts] @@ -2043,10 +2054,12 @@ class UnlabeledTextProcessor(Processor): test_filename: Optional[Union[Path, str]] = None, dev_split: float = 0, data_dir: Optional[Union[Path, str]] = None, - tasks: Dict = {}, + tasks: Optional[Dict] = None, proxies: Optional[Dict] = None, multithreading_rust: Optional[bool] = True, ): + if tasks is None: + tasks = {} super().__init__( tokenizer, max_seq_len, @@ -2069,8 +2082,10 @@ class UnlabeledTextProcessor(Processor): return dicts def dataset_from_dicts( - self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False, debug: bool = False ): + if indices is None: + indices = [] if return_baskets: raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor") texts = [dict_["text"] for dict_ in dicts] diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index c0f125441..6e47db69f 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -42,8 +42,8 @@ class BiAdaptiveModel(nn.Module): prediction_heads: List[PredictionHead], embeds_dropout_prob: float = 0.1, device: torch.device = torch.device("cuda"), - lm1_output_types: Union[str, List[str]] = ["per_sequence"], - lm2_output_types: Union[str, List[str]] = ["per_sequence"], + lm1_output_types: Optional[Union[str, List[str]]] = None, + lm2_output_types: Optional[Union[str, List[str]]] = None, loss_aggregation_fn: Optional[Callable] = None, ): """ @@ -54,12 +54,12 @@ class BiAdaptiveModel(nn.Module): language models will be zeroed. :param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set to "per_token", one embedding will be extracted per input token. If set to - "per_sequence", a single embedding will be extracted to represent the full + "per_sequence" (default), a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head. :param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set to "per_token", one embedding will be extracted per input token. If set to - "per_sequence", a single embedding will be extracted to represent the full + "per_sequence" (default), a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head. 
:param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda"). @@ -74,6 +74,10 @@ class BiAdaptiveModel(nn.Module): Note: The loss at this stage is per sample, i.e one tensor of shape (batchsize) per prediction head. """ + if lm1_output_types is None: + lm1_output_types = ["per_sequence"] + if lm2_output_types is None: + lm2_output_types = ["per_sequence"] super(BiAdaptiveModel, self).__init__() self.device = device diff --git a/haystack/modeling/model/prediction_head.py b/haystack/modeling/model/prediction_head.py index 6b05dca53..5d19c6694 100644 --- a/haystack/modeling/model/prediction_head.py +++ b/haystack/modeling/model/prediction_head.py @@ -231,7 +231,7 @@ class QuestionAnsweringHead(PredictionHead): def __init__( self, - layer_dims: List[int] = [768, 2], + layer_dims: Optional[List[int]] = None, task_name: str = "question_answering", no_ans_boost: float = 0.0, context_window_size: int = 100, @@ -244,7 +244,7 @@ class QuestionAnsweringHead(PredictionHead): **kwargs, ): """ - :param layer_dims: dimensions of Feed Forward block, e.g. [768,2], for adjusting to BERT embedding. Output should be always 2 + :param layer_dims: dimensions of Feed Forward block, e.g. [768,2] used by default, for adjusting to BERT embedding. Output should be always 2 :param kwargs: placeholder for passing generic parameters :param no_ans_boost: How much the no_answer logit is boosted/increased. The higher the value, the more likely a "no answer possible given the input text" is returned by the model @@ -260,6 +260,8 @@ class QuestionAnsweringHead(PredictionHead): :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence. Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default). """ + if layer_dims is None: + layer_dims = [768, 2] super(QuestionAnsweringHead, self).__init__() if len(kwargs) > 0: logger.warning( diff --git a/haystack/modeling/model/predictions.py b/haystack/modeling/model/predictions.py index 15308e7cd..baa5dfc72 100644 --- a/haystack/modeling/model/predictions.py +++ b/haystack/modeling/model/predictions.py @@ -248,7 +248,7 @@ class QAPred(Pred): aggregation_level: str, no_answer_gap: float, ground_truth_answer: Optional[str] = None, - answer_types: List[str] = [], + answer_types: Optional[List[str]] = None, ): """ :param id: The id of the passage or document @@ -262,6 +262,8 @@ class QAPred(Pred): :param ground_truth_answer: Ground truth answers :param answer_types: List of answer_types supported by this task e.g. 
["span", "yes_no", "no_answer"] """ + if answer_types is None: + answer_types = [] super().__init__(id, prediction, context) self.question = question self.token_offsets = token_offsets diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 86d03de38..b0b330057 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -44,9 +44,9 @@ class TriAdaptiveModel(nn.Module): prediction_heads: List[PredictionHead], embeds_dropout_prob: float = 0.1, device: torch.device = torch.device("cuda"), - lm1_output_types: Union[str, List[str]] = ["per_sequence"], - lm2_output_types: Union[str, List[str]] = ["per_sequence"], - lm3_output_types: Union[str, List[str]] = ["per_sequence"], + lm1_output_types: Optional[Union[str, List[str]]] = None, + lm2_output_types: Optional[Union[str, List[str]]] = None, + lm3_output_types: Optional[Union[str, List[str]]] = None, loss_aggregation_fn: Optional[Callable] = None, ): """ @@ -58,17 +58,17 @@ class TriAdaptiveModel(nn.Module): language model will be zeroed. :param lm1_output_types: How to extract the embeddings from the final layer of the first language model. When set to "per_token", one embedding will be extracted per input token. If set to - "per_sequence", a single embedding will be extracted to represent the full + "per_sequence" (default), a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head. :param lm2_output_types: How to extract the embeddings from the final layer of the second language model. When set to "per_token", one embedding will be extracted per input token. If set to - "per_sequence", a single embedding will be extracted to represent the full + "per_sequence" (default), a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head. :param lm3_output_types: How to extract the embeddings from the final layer of the third language model. When set to "per_token", one embedding will be extracted per input token. If set to - "per_sequence", a single embedding will be extracted to represent the full + "per_sequence" (default), a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head. :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda"). @@ -83,7 +83,12 @@ class TriAdaptiveModel(nn.Module): Note: The loss at this stage is per sample, i.e one tensor of shape (batchsize) per prediction head. 
""" - + if lm1_output_types is None: + lm1_output_types = ["per_sequence"] + if lm2_output_types is None: + lm2_output_types = ["per_sequence"] + if lm3_output_types is None: + lm3_output_types = ["per_sequence"] super(TriAdaptiveModel, self).__init__() self.device = device self.language_model1 = language_model1.to(device) diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py index eeae9800b..853442d7b 100644 --- a/haystack/nodes/_json_schema.py +++ b/haystack/nodes/_json_schema.py @@ -261,10 +261,12 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[ return component_schema, {"$ref": f"#/definitions/{component_name}"} -def get_json_schema(filename: str, version: str, modules: List[str] = ["haystack.document_stores", "haystack.nodes"]): +def get_json_schema(filename: str, version: str, modules: Optional[List[str]] = None): """ Generate JSON schema for Haystack pipelines. """ + if modules is None: + modules = ["haystack.document_stores", "haystack.nodes"] schema_definitions = {} # All the schemas for the node and accessory classes node_refs = [] # References to the nodes only (accessory classes cannot be listed among the nodes in a config) diff --git a/haystack/nodes/file_classifier/file_type.py b/haystack/nodes/file_classifier/file_type.py index 470461efc..22a9eaa98 100644 --- a/haystack/nodes/file_classifier/file_type.py +++ b/haystack/nodes/file_classifier/file_type.py @@ -1,5 +1,5 @@ import mimetypes -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Union, Optional import logging from pathlib import Path @@ -29,14 +29,16 @@ class FileTypeClassifier(BaseComponent): outgoing_edges = len(DEFAULT_TYPES) - def __init__(self, supported_types: List[str] = DEFAULT_TYPES): + def __init__(self, supported_types: Optional[List[str]] = None): """ Node that sends out files on a different output edge depending on their extension. :param supported_types: The file types that this node can distinguish between. - The default values are: `txt`, `pdf`, `md`, `docx`, and `html`. + If no value is provided, the value created by default comprises: `txt`, `pdf`, `md`, `docx`, and `html`. Lists with duplicate elements are not allowed. """ + if supported_types is None: + supported_types = DEFAULT_TYPES if len(set(supported_types)) != len(supported_types): duplicates = supported_types for item in set(supported_types): diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py index 02d7606f9..12411459a 100644 --- a/haystack/nodes/file_converter/base.py +++ b/haystack/nodes/file_converter/base.py @@ -137,7 +137,7 @@ class BaseConverter(BaseComponent): file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, - known_ligatures: Dict[str, str] = KNOWN_LIGATURES, + known_ligatures: Optional[Dict[str, str]] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None, @@ -153,12 +153,13 @@ class BaseConverter(BaseComponent): does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option. - :param known_ligatures: Some converters tends to recognize clusters of letters as ligatures, such as "ff" (double f). 
+ :param known_ligatures: Some converters tend to recognize clusters of letters as ligatures, such as "ff" (double f). Such ligatures however make text hard to compare with the content of other files, which are generally ligature free. Therefore we automatically find and replace the most common ligatures with their split counterparts. The default mapping is in `haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabeths but excludes all ligatures that are known to be used in IPA. + If no value is provided, this default is created and used. You can use this parameter to provide your own set of ligatures to clean up from the documents. :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format. @@ -171,6 +172,8 @@ class BaseConverter(BaseComponent): not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. """ + if known_ligatures is None: + known_ligatures = KNOWN_LIGATURES if isinstance(file_paths, Path): file_paths = [file_paths] @@ -206,7 +209,7 @@ class BaseConverter(BaseComponent): file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, - known_ligatures: Dict[str, str] = KNOWN_LIGATURES, + known_ligatures: Optional[Dict[str, str]] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None, diff --git a/haystack/nodes/file_converter/image.py b/haystack/nodes/file_converter/image.py index 927447279..1b0833894 100644 --- a/haystack/nodes/file_converter/image.py +++ b/haystack/nodes/file_converter/image.py @@ -24,7 +24,7 @@ class ImageToTextConverter(BaseConverter): def __init__( self, remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = ["eng"], + valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, ): """ @@ -37,7 +37,8 @@ class ImageToTextConverter(BaseConverter): (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html) This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting - in garbled text. Run the following line of code to check available language packs: + in garbled text. If no value is provided, English will be set as default. + Run the following line of code to check available language packs: # List of available languages print(pytesseract.get_languages(config='')) :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's @@ -45,6 +46,8 @@ class ImageToTextConverter(BaseConverter): not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. 
""" + if valid_languages is None: + valid_languages = ["eng"] super().__init__( remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys ) diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index c991f28e6..6594e4cfb 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -208,7 +208,7 @@ class PDFToTextOCRConverter(BaseConverter): def __init__( self, remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = ["eng"], + valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, ): """ @@ -223,12 +223,14 @@ class PDFToTextOCRConverter(BaseConverter): (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting - in garbled text. + in garbled text. If no value is provided, English will be set as default. :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. """ + if valid_languages is None: + valid_languages = ["eng"] # init image to text instance self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages) diff --git a/haystack/nodes/preprocessor/base.py b/haystack/nodes/preprocessor/base.py index 925502c9b..24f482255 100644 --- a/haystack/nodes/preprocessor/base.py +++ b/haystack/nodes/preprocessor/base.py @@ -21,7 +21,7 @@ class BasePreProcessor(BaseComponent): clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, - remove_substrings: List[str] = [], + remove_substrings: Optional[List[str]] = None, split_by: Literal["word", "sentence", "passage", None] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, @@ -41,7 +41,7 @@ class BasePreProcessor(BaseComponent): clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, - remove_substrings: List[str], + remove_substrings: Optional[List[str]], ) -> Document: raise NotImplementedError diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 97a03f3f7..311603c67 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -54,7 +54,7 @@ class PreProcessor(BasePreProcessor): clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, - remove_substrings: List[str] = [], + remove_substrings: Optional[List[str]] = None, split_by: Optional[Literal["word", "sentence", "passage"]] = "word", split_length: int = 200, split_overlap: int = 0, @@ -73,7 +73,7 @@ class PreProcessor(BasePreProcessor): or similar. :param clean_whitespace: Strip whitespaces before or after each line in the text. :param clean_empty_lines: Remove more than two empty lines in the text. - :param remove_substrings: Remove specified substrings from the text. + :param remove_substrings: Remove specified substrings from the text. If no value is provided an empty list is created by default. 
:param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting. :param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by -> "sentence", then each output document will have 10 sentences. @@ -100,6 +100,8 @@ class PreProcessor(BasePreProcessor): `AzureConverter`. :param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning. """ + if remove_substrings is None: + remove_substrings = [] super().__init__() try: @@ -132,7 +134,7 @@ class PreProcessor(BasePreProcessor): clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, - remove_substrings: List[str] = [], + remove_substrings: Optional[List[str]] = None, split_by: Optional[Literal["word", "sentence", "passage"]] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, @@ -143,6 +145,8 @@ class PreProcessor(BasePreProcessor): """ Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents. """ + if remove_substrings is None: + remove_substrings = [] if not isinstance(documents, list): warnings.warn( "Using a single Document as argument to the 'documents' parameter is deprecated. Use a list " @@ -197,14 +201,15 @@ class PreProcessor(BasePreProcessor): clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, - remove_substrings: List[str] = [], + remove_substrings: Optional[List[str]] = None, split_by: Optional[Literal["word", "sentence", "passage"]] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, ) -> List[Document]: - + if remove_substrings is None: + remove_substrings = [] if clean_whitespace is None: clean_whitespace = self.clean_whitespace if clean_header_footer is None: @@ -258,13 +263,15 @@ class PreProcessor(BasePreProcessor): clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, - remove_substrings: List[str], + remove_substrings: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, ) -> Document: """ Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__(). """ + if remove_substrings is None: + remove_substrings = [] if id_hash_keys is None: id_hash_keys = self.id_hash_keys diff --git a/haystack/nodes/query_classifier/transformers.py b/haystack/nodes/query_classifier/transformers.py index cccc8ad9f..c769241c2 100644 --- a/haystack/nodes/query_classifier/transformers.py +++ b/haystack/nodes/query_classifier/transformers.py @@ -67,7 +67,7 @@ class TransformersQueryClassifier(BaseQueryClassifier): tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", - labels: List[str] = DEFAULT_LABELS, + labels: Optional[List[str]] = None, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, @@ -96,6 +96,8 @@ class TransformersQueryClassifier(BaseQueryClassifier): [torch.device('cuda:0'), "mps", "cuda:1"]). 
When specifying `use_gpu=False` the devices parameter is not used and a single cpu device is used for inference. """ + if labels is None: + labels = DEFAULT_LABELS super().__init__() resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) if len(resolved_devices) > 1: diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index 56f74a6cd..c311ec913 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -176,7 +176,7 @@ class FARMReader(BaseReader): dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, - devices: List[torch.device] = [], + devices: Optional[List[torch.device]] = None, batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, @@ -205,6 +205,8 @@ class FARMReader(BaseReader): doc_stride: Optional[int] = None, max_query_length: Optional[int] = None, ): + if devices is None: + devices = [] if dev_filename: dev_split = 0 @@ -363,7 +365,7 @@ class FARMReader(BaseReader): dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, - devices: List[torch.device] = [], + devices: Optional[List[torch.device]] = None, batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, @@ -469,7 +471,7 @@ class FARMReader(BaseReader): dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, - devices: List[torch.device] = [], + devices: Optional[List[torch.device]] = None, batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, @@ -595,7 +597,7 @@ class FARMReader(BaseReader): dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, - devices: List[torch.device] = [], + devices: Optional[List[torch.device]] = None, batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 5, diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index fc992ce2c..b352986d8 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -794,7 +794,7 @@ class TableTextRetriever(DenseRetriever): top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, - embed_meta_fields: List[str] = ["name", "section_title", "caption"], + embed_meta_fields: Optional[List[str]] = None, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, @@ -825,7 +825,8 @@ class TableTextRetriever(DenseRetriever): then used to create the embedding. This is the approach used in the original paper and is likely to improve performance if your titles contain meaningful information for retrieval - (topic, entities etc.). + (topic, entities etc.). If no value is provided, a default will be created. + That default embeds name, section title and caption. :param use_fast_tokenizers: Whether to use fast Rust tokenizers :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training. Options: `dot_product` (Default) or `cosine` @@ -849,6 +850,8 @@ class TableTextRetriever(DenseRetriever): Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. :param use_fast: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True. 
""" + if embed_meta_fields is None: + embed_meta_fields = ["name", "section_title", "caption"] super().__init__() self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True) @@ -1225,7 +1228,7 @@ class TableTextRetriever(DenseRetriever): max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, - embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], + embed_meta_fields: Optional[List[str]] = None, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, @@ -1260,7 +1263,7 @@ class TableTextRetriever(DenseRetriever): :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None. :param batch_size: Total number of samples in 1 batch of data. :param embed_meta_fields: Concatenate meta fields with each passage and table. - The default setting in official MMRetrieval embeds page title, + If no value is provided, a default will be created. That default embeds page title, section title and caption with the corresponding table and title with corresponding text passage. :param num_hard_negatives: Number of hard negative passages (passages which are @@ -1290,6 +1293,8 @@ class TableTextRetriever(DenseRetriever): :param checkpoints_to_keep: The maximum number of train checkpoints to save. :param early_stopping: An initialized EarlyStopping object to control early stopping and saving of the best models. """ + if embed_meta_fields is None: + embed_meta_fields = ["page_title", "section_title", "caption"] self.processor.embed_meta_fields = embed_meta_fields self.processor.data_dir = Path(data_dir) @@ -1393,7 +1398,7 @@ class TableTextRetriever(DenseRetriever): max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, - embed_meta_fields: List[str] = ["name", "section_title", "caption"], + embed_meta_fields: Optional[List[str]] = None, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", @@ -1403,6 +1408,8 @@ class TableTextRetriever(DenseRetriever): """ Load TableTextRetriever from the specified directory. """ + if embed_meta_fields is None: + embed_meta_fields = ["name", "section_title", "caption"] load_dir = Path(load_dir) mm_retriever = cls( @@ -1441,7 +1448,7 @@ class EmbeddingRetriever(DenseRetriever): devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, - embed_meta_fields: List[str] = [], + embed_meta_fields: Optional[List[str]] = None, api_key: Optional[str] = None, ): """ @@ -1494,10 +1501,13 @@ class EmbeddingRetriever(DenseRetriever): This approach is also used in the TableTextRetriever paper and is likely to improve performance if your titles contain meaningful information for retrieval (topic, entities etc.). + If no value is provided, a default empty list will be created. :param api_key: The OpenAI API key or the Cohere API key. Required if one wants to use OpenAI/Cohere embeddings. 
For more details see https://beta.openai.com/account/api-keys and https://dashboard.cohere.ai/api-keys """ + if embed_meta_fields is None: + embed_meta_fields = [] super().__init__() self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True) @@ -1929,7 +1939,7 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever): devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, - embed_meta_fields: List[str] = [], + embed_meta_fields: Optional[List[str]] = None, ): """ :param document_store: An instance of DocumentStore from which to retrieve documents. @@ -1977,7 +1987,10 @@ class MultihopEmbeddingRetriever(EmbeddingRetriever): This approach is also used in the TableTextRetriever paper and is likely to improve performance if your titles contain meaningful information for retrieval (topic, entities etc.). + If no value is provided, a default empty list will be created. """ + if embed_meta_fields is None: + embed_meta_fields = [] super().__init__( embedding_model=embedding_model, document_store=document_store, diff --git a/haystack/nodes/retriever/multimodal/embedder.py b/haystack/nodes/retriever/multimodal/embedder.py index 58c67458a..9267192bd 100644 --- a/haystack/nodes/retriever/multimodal/embedder.py +++ b/haystack/nodes/retriever/multimodal/embedder.py @@ -44,7 +44,7 @@ class MultiModalEmbedder: embedding_models: Dict[str, Union[Path, str]], # replace str with ContentTypes starting from Python3.8 feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None, batch_size: int = 16, - embed_meta_fields: List[str] = ["name"], + embed_meta_fields: Optional[List[str]] = None, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, @@ -67,6 +67,7 @@ class MultiModalEmbedder: This is the approach used in the original paper and is likely to improve performance if your titles contain meaningful information for retrieval (topic, entities etc.). + If no value is provided, a default with "name" as embedding field is created. :param progress_bar: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. :param devices: List of GPU (or CPU) devices to limit inference to certain GPUs and not use all available ones. @@ -78,6 +79,8 @@ class MultiModalEmbedder: the local token is used, which must be previously created using `transformer-cli login`. 
For more information, see [Hugging Face documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained)
"""
+ if embed_meta_fields is None:
+ embed_meta_fields = ["name"]
super().__init__()
self.devices = get_devices(devices)
diff --git a/haystack/nodes/retriever/multimodal/retriever.py b/haystack/nodes/retriever/multimodal/retriever.py
index d121eb0e2..df366defa 100644
--- a/haystack/nodes/retriever/multimodal/retriever.py
+++ b/haystack/nodes/retriever/multimodal/retriever.py
@@ -22,11 +22,11 @@ class MultiModalRetriever(DenseRetriever):
query_embedding_model: Union[Path, str],
document_embedding_models: Dict[str, Union[Path, str]], # Replace str with ContentTypes starting Python3.8
query_type: str = "text", # Replace str with ContentTypes starting Python3.8
- query_feature_extractor_params: Dict[str, Any] = {"max_length": 64},
- document_feature_extractors_params: Dict[str, Dict[str, Any]] = {"text": {"max_length": 256}},
+ query_feature_extractor_params: Optional[Dict[str, Any]] = None,
+ document_feature_extractors_params: Optional[Dict[str, Dict[str, Any]]] = None,
top_k: int = 10,
batch_size: int = 16,
- embed_meta_fields: List[str] = ["name"],
+ embed_meta_fields: Optional[List[str]] = None,
similarity_function: str = "dot_product",
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
@@ -46,14 +46,14 @@ class MultiModalRetriever(DenseRetriever):
checkpoint with the content type it should handle ("text", "table", "image", and so on). The format equals the one used by Hugging Face transformers' modelhub models.
:param query_type: The content type of the query ("text", "image" and so on).
- :param query_feature_extraction_params: The parameters to pass to the feature extractor of the query.
- :param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents.
+ :param query_feature_extraction_params: The parameters to pass to the feature extractor of the query. If no value is provided, a default dictionary with "max_length": 64 will be set.
+ :param document_feature_extraction_params: The parameters to pass to the feature extractor of the documents. If no value is provided, a default dictionary with "text": {"max_length": 256} will be set.
:param top_k: How many documents to return per query.
:param batch_size: Number of questions or documents to encode at once. For multiple GPUs, this is the total batch size.
:param embed_meta_fields: Concatenate the provided meta fields to a (text) pair that is then used to create the embedding.
This is likely to improve performance if your titles contain meaningful information
- for retrieval (topic, entities, and so on). Note that only text and table documents support this feature.
+ for retrieval (topic, entities, and so on). Note that only text and table documents support this feature. If no value is provided, a default with "name" as embedding field will be created.
:param similarity_function: Which function to apply for calculating the similarity of query and document
embeddings during training. Options: `dot_product` (default) or `cosine`.
:param progress_bar: Whether to show a tqdm progress bar or not.
:param devices: List of GPU (or CPU) devices to limit inference to certain GPUs and not use all available ones.
@@ -72,6 +72,12 @@ class MultiModalRetriever(DenseRetriever):
range are scaled to a range of [0,1], where 1 means extremely relevant. Otherwise raw similarity scores
(for example, cosine or dot_product) are used.
""" + if query_feature_extractor_params is None: + query_feature_extractor_params = {"max_length": 64} + if document_feature_extractors_params is None: + document_feature_extractors_params = {"text": {"max_length": 256}} + if embed_meta_fields is None: + embed_meta_fields = ["name"] super().__init__() self.similarity_function = similarity_function diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index e6065ae8c..747581959 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -745,12 +745,12 @@ class Pipeline: cls, index_pipeline: Pipeline, query_pipeline: Pipeline, - index_params: dict = {}, - query_params: dict = {}, + index_params: Optional[Dict] = None, + query_params: Optional[Dict] = None, dataset: str = "scifact", dataset_dir: Path = Path("."), num_documents: Optional[int] = None, - top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], + top_k_values: Optional[List[int]] = None, keep_index: bool = False, ) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]: """ @@ -765,7 +765,7 @@ class Pipeline: :param dataset_dir: The directory to store the dataset to. :param num_documents: Maximum number of documents to load from given dataset. If set to None (default) or to a value larger than the number of documents in the dataset, the full dataset is loaded. - :param top_k_values: The top_k values each metric will be calculated for. + :param top_k_values: The top_k values each metric will be calculated for. By default, the values are 1, 3, 5, 10, 100, and 1000. :param keep_index: Whether to keep the index after evaluation. If True the index will be kept after beir evaluation. Otherwise it will be deleted immediately afterwards. Defaults to False. @@ -773,6 +773,12 @@ class Pipeline: Returns a tuple containing the ncdg, map, recall and precision scores. Each metric is represented by a dictionary containing the scores for each top_k value. """ + if index_params is None: + index_params = {} + if query_params is None: + query_params = {} + if top_k_values is None: + top_k_values = [1, 3, 5, 10, 100, 1000] try: from beir import util from beir.datasets.data_loader import GenericDataLoader @@ -855,11 +861,11 @@ class Pipeline: experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: Optional[List[Dict[str, Any]]] = None, - corpus_meta: Dict[str, Any] = {}, - evaluation_set_meta: Dict[str, Any] = {}, - pipeline_meta: Dict[str, Any] = {}, - index_params: dict = {}, - query_params: dict = {}, + corpus_meta: Optional[Dict[str, Any]] = None, + evaluation_set_meta: Optional[Dict[str, Any]] = None, + pipeline_meta: Optional[Dict[str, Any]] = None, + index_params: Optional[Dict] = None, + query_params: Optional[Dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, @@ -997,6 +1003,17 @@ class Pipeline: Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. :param context_matching_threshold: Score threshold that candidates must surpass to be included into the result list. 
Range: [0,100] """ + if corpus_meta is None: + corpus_meta = {} + if evaluation_set_meta is None: + evaluation_set_meta = {} + if pipeline_meta is None: + pipeline_meta = {} + if index_params is None: + index_params = {} + if query_params is None: + query_params = {} + if experiment_tracking_tool is not None: tracking_head_cls = TRACKING_TOOL_TO_HEAD.get(experiment_tracking_tool, None) if tracking_head_cls is None: @@ -2213,7 +2230,7 @@ class Pipeline: "document_id_or_answer", ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", - wrong_examples_fields: List[str] = ["answer", "context", "document_id"], + wrong_examples_fields: Optional[List[str]] = None, max_characters_per_field: int = 150, ): """ @@ -2249,9 +2266,11 @@ class Pipeline: - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. The default value is 'any'. In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'. - :param wrong_examples_fields: A list of fields to include in the worst samples. + :param wrong_examples_fields: A list of fields to include in the worst samples. By default, "answer", "context", and "document_id" are included. :param max_characters_per_field: The maximum number of characters to include in the worst samples report (per field). """ + if wrong_examples_fields is None: + wrong_examples_fields = ["answer", "context", "document_id"] graph = DiGraph(self.graph.edges) print_eval_report( eval_result=eval_result, diff --git a/haystack/pipelines/ray.py b/haystack/pipelines/ray.py index 02ff0c169..94f8a09ef 100644 --- a/haystack/pipelines/ray.py +++ b/haystack/pipelines/ray.py @@ -202,7 +202,7 @@ class RayPipeline(Pipeline): @classmethod def _create_ray_deployment( - cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = {} + cls, component_name: str, pipeline_config: dict, serve_deployment_kwargs: Optional[Dict[str, Any]] = None ): """ Create a Ray Deployment for the Component. @@ -215,6 +215,8 @@ class RayPipeline(Pipeline): Ray Serve API docs (https://docs.ray.io/en/latest/serve/package-ref.html) under the `ray.serve.deployment()` method """ + if serve_deployment_kwargs is None: + serve_deployment_kwargs = {} RayDeployment = serve.deployment( _RayDeploymentWrapper, name=component_name, **serve_deployment_kwargs # type: ignore ) diff --git a/haystack/pipelines/standard_pipelines.py b/haystack/pipelines/standard_pipelines.py index bd706bea5..b82dd03e1 100644 --- a/haystack/pipelines/standard_pipelines.py +++ b/haystack/pipelines/standard_pipelines.py @@ -241,7 +241,7 @@ class BaseStandardPipeline(ABC): "document_id_or_answer", ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", - wrong_examples_fields: List[str] = ["answer", "context", "document_id"], + wrong_examples_fields: Optional[List[str]] = None, max_characters_per_field: int = 150, ): """ @@ -277,9 +277,11 @@ class BaseStandardPipeline(ABC): - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. The default value is 'any'. In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'. 
- :param wrong_examples_fields: A list of field names to include in the worst samples. + :param wrong_examples_fields: A list of field names to include in the worst samples. By default, "answer", "context", and "document_id" are used. :param max_characters_per_field: The maximum number of characters per wrong example to show (per field). """ + if wrong_examples_fields is None: + wrong_examples_fields = ["answer", "context", "document_id"] if metrics_filter is None: metrics_filter = self.metrics_filter self.pipeline.print_eval_report( diff --git a/haystack/pipelines/utils.py b/haystack/pipelines/utils.py index 784bdd392..b88a83d91 100644 --- a/haystack/pipelines/utils.py +++ b/haystack/pipelines/utils.py @@ -178,7 +178,7 @@ def print_eval_report( "document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer" ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", - wrong_examples_fields: List[str] = ["answer", "context", "document_id"], + wrong_examples_fields: Optional[List[str]] = None, max_characters_per_field: int = 150, ): """ @@ -216,9 +216,11 @@ def print_eval_report( - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. The default value is 'any'. In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'. - :param wrong_examples_fields: A list of field names that should be included in the wrong examples. + :param wrong_examples_fields: A list of field names that should be included in the wrong examples. By default, "answer", "context", and "document_id" are used. :param max_characters_per_field: The maximum number of characters to show in the wrong examples report (per field). 
""" + if wrong_examples_fields is None: + wrong_examples_fields = ["answer", "context", "document_id"] if any(degree > 1 for node, degree in graph.out_degree): logger.warning("Pipelines with junctions are currently not supported.") return @@ -309,9 +311,11 @@ def _format_wrong_examples_report( "document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer" ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", - fields: List[str] = ["answer", "context", "document_id"], + fields: Optional[List[str]] = None, max_chars: int = 150, ): + if fields is None: + fields = ["answer", "context", "document_id"] examples = { node: eval_result.wrong_examples( node, document_scope=document_scope, answer_scope=answer_scope, n=n_wrong_examples diff --git a/haystack/schema.py b/haystack/schema.py index c808b180e..a3f039ad5 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -302,7 +302,9 @@ class SpeechDocument(Document): return f"" return f" 100 else ''}', content_audio={self.content_audio}>" - def to_dict(self, field_map={}) -> Dict: + def to_dict(self, field_map=None) -> Dict: + if field_map is None: + field_map = {} dictionary = super().to_dict(field_map=field_map) for key, value in dictionary.items(): if isinstance(value, Path): @@ -310,7 +312,9 @@ class SpeechDocument(Document): return dictionary @classmethod - def from_dict(cls, dict, field_map={}, id_hash_keys=None): + def from_dict(cls, dict, field_map=None, id_hash_keys=None): + if field_map is None: + field_map = {} doc = super().from_dict(dict=dict, field_map=field_map, id_hash_keys=id_hash_keys) doc.content_audio = Path(dict["content_audio"]) return doc diff --git a/haystack/telemetry.py b/haystack/telemetry.py index c7fa56a2c..a6211ff29 100644 --- a/haystack/telemetry.py +++ b/haystack/telemetry.py @@ -133,7 +133,7 @@ def send_event(func): return wrapper -def send_custom_event(event: str = "", payload: Dict[str, Any] = {}): +def send_custom_event(event: str = "", payload: Optional[Dict[str, Any]] = None): """ This method can be called directly from anywhere in Haystack to send an event. Enriches the given event with metadata and sends it to the posthog server if telemetry is enabled. 
@@ -143,6 +143,8 @@ def send_custom_event(event: str = "", payload: Dict[str, Any] = {}): :param payload: A dictionary containing event meta data, e.g., parameter settings """ global user_id # pylint: disable=global-statement + if payload is None: + payload = {} try: def send_request(payload: Dict[str, Any]): diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index bb62471bb..3f1116d22 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -144,7 +144,7 @@ class DeepsetCloudClient: def post( self, url: str, - json: dict = {}, + json: Optional[Dict] = None, data: Optional[Any] = None, query_params: Optional[dict] = None, headers: Optional[dict] = None, @@ -152,6 +152,8 @@ class DeepsetCloudClient: files: Optional[Any] = None, raise_on_error: bool = True, ): + if json is None: + json = {} return self._execute_request( method="POST", url=url, @@ -167,7 +169,7 @@ class DeepsetCloudClient: def post_with_auto_paging( self, url: str, - json: dict = {}, + json: Optional[Dict] = None, data: Optional[Any] = None, query_params: Optional[dict] = None, headers: Optional[dict] = None, @@ -175,6 +177,8 @@ class DeepsetCloudClient: raise_on_error: bool = True, auto_paging_page_size: Optional[int] = None, ): + if json is None: + json = {} return self._execute_auto_paging_request( method="POST", url=url, @@ -211,7 +215,7 @@ class DeepsetCloudClient: def put_with_auto_paging( self, url: str, - json: dict = {}, + json: Optional[Dict] = None, data: Optional[Any] = None, query_params: Optional[dict] = None, headers: Optional[dict] = None, @@ -219,6 +223,8 @@ class DeepsetCloudClient: raise_on_error: bool = True, auto_paging_page_size: Optional[int] = None, ): + if json is None: + json = {} return self._execute_auto_paging_request( method="PUT", url=url, diff --git a/pyproject.toml b/pyproject.toml index f424a5ec9..8c7ceffdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -278,7 +278,6 @@ disable = [ "unspecified-encoding", "unidiomatic-typecheck", "no-name-in-module", - "dangerous-default-value", "consider-using-with", "redefined-outer-name", "arguments-renamed",