chore: enable logging-fstring-interpolation and cleanup (#3843)

* enable logging-fstring-interpolation

* remove logging-fstring-interpolation from exclusion list

* remove f-string prefixes left on the implicit string concatenations added by black

* remove from rest_api too

* fix % sign
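
For context, the enabling change itself is a one-line removal from the linter's exclusion list. A minimal sketch, assuming the check is configured under pyproject.toml (the section layout and neighboring entries are illustrative, not copied from this repo):

    [tool.pylint."MESSAGES CONTROL"]
    disable = [
        "missing-module-docstring",   # illustrative neighbor entry
        # "logging-fstring-interpolation" was removed from this list,
        # so pylint now reports W1203 on f-strings inside logging calls.
    ]
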
ZanSara 2023-01-12 09:31:21 +01:00 committed by GitHub
parent 4cbc8550d6
commit d157e41c1f
64 changed files with 408 additions and 249 deletions
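
Every hunk below applies the same mechanical rewrite. pylint's logging-fstring-interpolation check (W1203) flags f-strings passed to logging calls because an f-string is rendered eagerly, whether or not the record is ever emitted; passing a %-style template plus arguments defers formatting to LogRecord.getMessage(). A minimal sketch of the before/after:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    index = "document"

    # Before: the f-string is formatted even when INFO is disabled.
    logger.info(f"Duplicate Documents: Document already exists in index '{index}'")

    # After: template and arguments are stored on the LogRecord;
    # rendering happens only if a handler actually emits the record.
    logger.info("Duplicate Documents: Document already exists in index '%s'", index)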

View File

@ -476,8 +476,9 @@ class BaseDocumentStore(BaseComponent):
else:
jsonl_filename = (file_path.parent / (file_path.stem + ".jsonl")).as_posix()
logger.info(
f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
f"Converting json to jsonl to: {jsonl_filename}"
"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
"Converting json to jsonl to: %s",
jsonl_filename,
)
squad_json_to_jsonl(filename, jsonl_filename)
self.add_eval_data(
@ -622,8 +623,9 @@ class BaseDocumentStore(BaseComponent):
for document in documents:
if document.id in _hash_ids:
logger.info(
f"Duplicate Documents: Document with id '{document.id}' already exists in index "
f"'{index or self.index}'"
"Duplicate Documents: Document with id '%s' already exists in index '%s'",
document.id,
index or self.index,
)
continue
_documents.append(document)

View File

@ -118,23 +118,25 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
indexing_info = index_info["indexing"]
if indexing_info["pending_file_count"] > 0:
logger.warning(
f"{indexing_info['pending_file_count']} files are pending to be indexed. "
f"Indexing status: {indexing_info['status']}"
"%s files are pending to be indexed. Indexing status: %s",
indexing_info["pending_file_count"],
indexing_info["status"],
)
if index in deployed_unhealthy_pipelines:
logger.warning(
f"The index '{index}' is unhealthy and should be redeployed using "
f"`Pipeline.undeploy_on_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`."
"The index '%s' is unhealthy and should be redeployed using "
"`Pipeline.undeploy_on_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`.",
index,
)
else:
logger.info(
f"You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
f"This document store always returns empty responses. This can be useful if you want to "
f"create a new pipeline within deepset Cloud.\n"
f"In order to create a new pipeline on deepset Cloud, take the following steps: \n"
f" - create query and indexing pipelines using this DocumentStore\n"
f" - call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name`\n"
f" - call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name`"
"You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
"This document store always returns empty responses. This can be useful if you want to "
"create a new pipeline within deepset Cloud.\n"
"In order to create a new pipeline on deepset Cloud, take the following steps: \n"
" - create query and indexing pipelines using this DocumentStore\n"
" - call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name`\n"
" - call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name`"
)
self.evaluation_set_client = DeepsetCloud.get_evaluation_set_client(

View File

@ -508,9 +508,10 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore):
if not any(indices):
logger.warning(
f"To use an index, you must create it first. The index called '{index_name}' doesn't exist. "
f"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
f"Note that this instance doesn't validate the index after you create it."
"To use an index, you must create it first. The index called '%s' doesn't exist. "
"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
"Note that this instance doesn't validate the index after you create it.",
index_name,
)
# If the index name is an alias that groups multiple existing indices, each of them must have an embedding_field.

View File

@ -206,7 +206,10 @@ class FAISSDocumentStore(SQLDocumentStore):
index.hnsw.efConstruction = ef_construction
logger.info(
f"HNSW params: n_links: {n_links}, efSearch: {index.hnsw.efSearch}, efConstruction: {index.hnsw.efConstruction}"
"HNSW params: n_links: %s, efSearch: %s, efConstruction: %s",
n_links,
index.hnsw.efSearch,
index.hnsw.efConstruction,
)
else:
index = faiss.index_factory(embedding_dim, index_factory, metric_type)
@ -550,8 +553,10 @@ class FAISSDocumentStore(SQLDocumentStore):
"""
if index == self.index:
logger.warning(
f"Deletion of default index '{index}' detected. "
f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
"Deletion of default index '%s' detected. "
"If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
index,
self.__class__.__name__,
)
if index in self.faiss_indexes:
del self.faiss_indexes[index]

View File

@ -110,8 +110,9 @@ class InMemoryDocumentStore(KeywordDocumentStore):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=self.use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.main_device = self.devices[0]
@ -184,7 +185,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
)
if duplicate_documents == "skip":
logger.warning(
f"Duplicate Documents: Document with id '{document.id} already exists in index " f"'{index}'"
"Duplicate Documents: Document with id '%s' already exists in index '%s'", document.id, index
)
continue
self.indexes[index][document.id] = document
@ -205,8 +206,9 @@ class InMemoryDocumentStore(KeywordDocumentStore):
textual_documents = [doc for doc in all_documents if doc.content_type == "text"]
if len(textual_documents) < len(all_documents):
logger.warning(
f"Some documents in {index} index are non-textual."
f" They will be written to the index, but the corresponding BM25 representations will not be generated."
"Some documents in %s index are non-textual."
" They will be written to the index, but the corresponding BM25 representations will not be generated.",
index,
)
tokenized_corpus = [
@ -236,10 +238,11 @@ class InMemoryDocumentStore(KeywordDocumentStore):
duplicate_ids: list = [label.id for label in self._get_duplicate_labels(label_objects, index=index)]
if len(duplicate_ids) > 0:
logger.warning(
f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
f" the answer annotation and not the question."
f" Problematic ids: {','.join(duplicate_ids)}"
"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
" the answer annotation and not the question."
" Problematic ids: %s",
",".join(duplicate_ids),
)
for label in label_objects:

View File

@ -497,8 +497,10 @@ class MilvusDocumentStore(SQLDocumentStore):
"""
if index == self.index:
logger.warning(
f"Deletion of default index '{index}' detected. "
f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
"Deletion of default index '%s' detected. "
"If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
index,
self.__class__.__name__,
)
self._delete_index(index)

View File

@ -527,9 +527,10 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
if not any(indices):
# We don't want to raise here as creating a query-only document store before the index being created asynchronously is a valid use case.
logger.warning(
f"Before you can use an index, you must create it first. The index '{index_name}' doesn't exist. "
f"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
f"Note that this instance doesn't validate the index after you created it."
"Before you can use an index, you must create it first. The index '%s' doesn't exist. "
"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
"Note that this instance doesn't validate the index after you created it.",
index_name,
)
# If the index name is an alias that groups multiple existing indices, each of them must have an embedding_field.
@ -583,11 +584,11 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
if self.index_type == "hnsw" and ef_search != 20:
body = {"knn.algo_param.ef_search": 20}
self.client.indices.put_settings(index=index_id, body=body, headers=headers)
logger.info(f"Set ef_search to 20 for hnsw index '{index_id}'.")
logger.info("Set ef_search to 20 for hnsw index '%s'.", index_id)
elif self.index_type == "flat" and ef_search != 512:
body = {"knn.algo_param.ef_search": 512}
self.client.indices.put_settings(index=index_id, body=body, headers=headers)
logger.info(f"Set ef_search to 512 for hnsw index '{index_id}'.")
logger.info("Set ef_search to 512 for hnsw index '%s'.", index_id)
def _validate_approximate_knn_settings(
self, existing_embedding_field: Dict[str, Any], index_settings: Dict[str, Any], index_id: str

View File

@ -216,7 +216,10 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
except Exception as e:
if hasattr(e, "status_code") and e.status_code == 429: # type: ignore
logger.warning(
f"Failed to insert a batch of '{len(documents)}' documents because of a 'Too Many Requeset' response. Splitting the number of documents into two chunks with the same size and retrying in {_timeout} seconds."
"Failed to insert a batch of '%s' documents because of a 'Too Many Requeset' response. "
"Splitting the number of documents into two chunks with the same size and retrying in %s seconds.",
len(documents),
_timeout,
)
if len(documents) == 1:
logger.warning(
@ -478,10 +481,11 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
duplicate_ids: list = [label.id for label in self._get_duplicate_labels(label_list, index=index)]
if len(duplicate_ids) > 0:
logger.warning(
f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
f" the answer annotation and not the question."
f" Problematic ids: {','.join(duplicate_ids)}"
"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
" the answer annotation and not the question."
" Problematic ids: %s",
",".join(duplicate_ids),
)
labels_to_index = []
for label in label_list:
@ -1087,7 +1091,8 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
if not isinstance(query, str):
logger.warning(
"The query provided seems to be not a string, but an object "
f"of type {type(query)}. This can cause the query to fail."
"of type %s. This can cause the query to fail.",
type(query),
)
operator = "AND" if all_terms_must_match else "OR"
body = {
@ -1599,8 +1604,10 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
"""
if index == self.index:
logger.warning(
f"Deletion of default index '{index}' detected. "
f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
"Deletion of default index '%s' detected. "
"If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
index,
self.__class__.__name__,
)
self._delete_index(index)

View File

@ -448,10 +448,11 @@ class SQLDocumentStore(BaseDocumentStore):
duplicate_ids: list = [label.id for label in self._get_duplicate_labels(labels, index=index)]
if len(duplicate_ids) > 0:
logger.warning(
f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
f" the answer annotation and not the question."
f" Problematic ids: {','.join(duplicate_ids)}"
"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
" the answer annotation and not the question."
" Problematic ids: %s",
",".join(duplicate_ids),
)
# TODO: Use batch_size

View File

@ -52,8 +52,9 @@ def eval_data_from_json(
problematic_ids.extend(cur_problematic_ids)
if len(problematic_ids) > 0:
logger.warning(
f"Could not convert an answer for {len(problematic_ids)} questions.\n"
f"There were conversion errors for question ids: {problematic_ids}"
"Could not convert an answer for %s questions.\nThere were conversion errors for question ids: %s",
len(problematic_ids),
problematic_ids,
)
return docs, labels
@ -99,8 +100,10 @@ def eval_data_from_jsonl(
if len(docs) >= batch_size:
if len(problematic_ids) > 0:
logger.warning(
f"Could not convert an answer for {len(problematic_ids)} questions.\n"
f"There were conversion errors for question ids: {problematic_ids}"
"Could not convert an answer for %s questions.\n"
"There were conversion errors for question ids: %s",
len(problematic_ids),
problematic_ids,
)
yield docs, labels
docs = []

View File

@ -358,7 +358,9 @@ class WeaviateDocumentStore(KeywordDocumentStore):
generated_uuid = str(uuid.UUID(hashed_id.hexdigest()[::2]))
if not self.uuid_format_warning_raised:
logger.warning(
f"Document id {id} is not in uuid format. Such ids will be replaced by uuids, in this case {generated_uuid}."
"Document id %s is not in uuid format. Such ids will be replaced by uuids, in this case %s.",
id,
generated_uuid,
)
self.uuid_format_warning_raised = True
id = generated_uuid
@ -1507,8 +1509,10 @@ class WeaviateDocumentStore(KeywordDocumentStore):
"""
if index == self.index:
logger.warning(
f"Deletion of default index '{index}' detected. "
f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
"Deletion of default index '%s' detected. "
"If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
index,
self.__class__.__name__,
)
self._delete_index(index)

View File

@ -336,7 +336,9 @@ class DataSilo:
logger.warning("No dev set created. Please adjust the dev_split parameter.")
logger.info(
f"Took {len(dev_dataset)} samples out of train set to create dev set (dev split is roughly {self.processor.dev_split})"
"Took %s samples out of train set to create dev set (dev split is roughly %s)",
len(dev_dataset),
self.processor.dev_split,
)
def random_split_ConcatDataset(self, ds: ConcatDataset, lengths: List[int]):
@ -387,7 +389,7 @@ class DataSilo:
clipped, ave_len, seq_lens, max_seq_len = self._calc_length_stats_biencoder()
else:
logger.warning(
f"Could not compute length statistics because 'input_ids' or 'query_input_ids' and 'passage_input_ids' are missing."
"Could not compute length statistics because 'input_ids' or 'query_input_ids' and 'passage_input_ids' are missing."
)
clipped = -1
ave_len = -1
@ -416,11 +418,14 @@ class DataSilo:
logger.info("Proportion clipped: {}".format(clipped))
if clipped > 0.5:
logger.info(
f"[Haystack Tip] {round(clipped * 100, 1)}% of your samples got cut down to {max_seq_len} tokens. "
"[Haystack Tip] %s%% of your samples got cut down to %s tokens. "
"Consider increasing max_seq_len "
f"(the maximum value allowed with the current model is max_seq_len={self.processor.tokenizer.model_max_length}, "
"(the maximum value allowed with the current model is max_seq_len=%s, "
"if this is not enough consider splitting the document in smaller units or changing the model). "
"This will lead to higher memory consumption but is likely to improve your model performance"
"This will lead to higher memory consumption but is likely to improve your model performance",
round(clipped * 100, 1),
max_seq_len,
self.processor.tokenizer.model_max_length,
)
elif "query_input_ids" in self.tensor_names and "passage_input_ids" in self.tensor_names:
logger.info(
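
The [Haystack Tip] hunk above is also where the "fix % sign" part of the commit message applies: once a message becomes a %-style template, a literal percent sign must be doubled. A short sketch:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    clipped = 0.62

    # "%%" renders as a literal "%"; a single "%" followed by another
    # character would be parsed as a conversion specifier and fail at emit time.
    logger.info("[Haystack Tip] %s%% of your samples got cut down.", round(clipped * 100, 1))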

View File

@ -59,9 +59,11 @@ def convert_features_to_dataset(features):
base = check.ravel()[0]
if not np.issubdtype(type(base), np.integer):
logger.warning(
f"Problem during conversion to torch tensors:\n"
f"A non-integer value for feature '{t_name}' with a value of: "
f"'{base}' will be converted to a torch tensor of dtype long."
"Problem during conversion to torch tensors:\n"
"A non-integer value for feature '%s' with a value of: "
"'%s' will be converted to a torch tensor of dtype long.",
t_name,
base,
)
except:
logger.debug(

View File

@ -38,10 +38,11 @@ def sample_to_features_text(sample, tasks, max_seq_len, tokenizer):
if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
logger.error(
f"FastTokenizer encoded sample {sample.clear_text['text']} to "
f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
f"from number of tokens produced in tokenize_with_metadata(). \n"
f"Further processing is likely to be wrong."
"FastTokenizer encoded sample %s to %s tokens, which differs "
"from number of tokens produced in tokenize_with_metadata(). \n"
"Further processing is likely to be wrong.",
sample.clear_text["text"],
len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1),
)
else:
# TODO It might be cleaner to adjust the data structure in sample.tokenized

View File

@ -565,8 +565,9 @@ class SquadProcessor(Processor):
)
except Exception as e:
logger.warning(
f"Could not devide document into passages. Document: {basket.raw['document_text'][:200]}\n"
f"With error: {e}"
"Could not devide document into passages. Document: %s\nWith error: %s",
basket.raw["document_text"][:200],
e,
)
passage_spans = []
@ -663,8 +664,9 @@ class SquadProcessor(Processor):
# check if answer string can be found in context
if answer_text not in doc_text:
logger.warning(
f"Answer '{answer['text']}' not contained in context.\n"
f"Example will not be converted for training/evaluation."
"Answer '%s' not contained in context.\n"
"Example will not be converted for training/evaluation.",
answer["text"],
)
error_in_answer = True
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
@ -672,8 +674,10 @@ class SquadProcessor(Processor):
break # Break loop around answers, so the error message is not shown multiple times
if answer_indices.strip() != answer_text.strip():
logger.warning(
f"Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n"
f"Example will not be converted for training/evaluation."
"Answer using start/end indices is '%s' while gold label text is '%s'.\n"
"Example will not be converted for training/evaluation.",
answer_indices,
answer_text,
)
error_in_answer = True
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
@ -1025,7 +1029,7 @@ class TextSimilarityProcessor(Processor):
if problematic_ids:
logger.error(
f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
"There were %s errors during preprocessing at positions: %s", len(problematic_ids), problematic_ids
)
if return_baskets:
@ -1104,7 +1108,7 @@ class TextSimilarityProcessor(Processor):
if len(tokenized_query) == 0:
logger.warning(
f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
)
return None
@ -1222,7 +1226,8 @@ class TextSimilarityProcessor(Processor):
if title is None:
title = ""
logger.warning(
f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' "
"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '%s' ",
ctx,
)
res.append(tuple((title, ctx)))
return res
@ -1545,7 +1550,7 @@ class TableTextSimilarityProcessor(Processor):
if problematic_ids:
logger.error(
f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
"There were %s errors during preprocessing at positions: %s", len(problematic_ids), problematic_ids
)
if return_baskets:
@ -1588,7 +1593,7 @@ class TableTextSimilarityProcessor(Processor):
if len(tokenized_query) == 0:
logger.warning(
f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
)
return None

View File

@ -125,7 +125,8 @@ class Evaluator:
temperature_change = (abs(temperature_current - temperature_previous) / temperature_previous) * 100.0
if temperature_change > 50:
logger.warning(
f"temperature used for calibration of confidence scores changed by more than {temperature_change} percent"
"temperature used for calibration of confidence scores changed by more than %s percent",
temperature_change,
)
if hasattr(head, "aggregate_preds"):
# Needed to convert NQ ids from np arrays to strings
@ -146,8 +147,11 @@ class Evaluator:
result["report"] = compute_report_metrics(head, preds_all[head_num], label_all[head_num])
except:
logger.error(
f"Couldn't create eval report for head {head_num} with following preds and labels:"
f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}"
"Couldn't create eval report for head %s with following preds and labels:"
"\n Preds: %s \n Labels: %s",
head_num,
preds_all[head_num],
label_all[head_num],
)
result["report"] = "Error"

View File

@ -77,8 +77,9 @@ class Inferencer:
self.devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.processor = processor
@ -187,9 +188,7 @@ class Inferencer:
devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
if len(devices) > 1:
logger.warning(
f"Multiple devices are not supported in Inferencer, " f"using the first device {devices[0]}."
)
logger.warning("Multiple devices are not supported in Inferencer, using the first device %s.", devices[0])
name = os.path.basename(model_name_or_path)

View File

@ -390,8 +390,9 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
for prediction_head in self.prediction_heads:
if len(prediction_head.layer_dims) != 2:
logger.error(
f"Currently conversion only works for PredictionHeads that are a single layer Feed Forward NN with dimensions [LM_output_dim, number_classes].\n"
f" Your PredictionHead has {str(prediction_head.layer_dims)} dimensions."
"Currently conversion only works for PredictionHeads that are a single layer Feed Forward NN with dimensions [LM_output_dim, number_classes].\n"
" Your PredictionHead has %s dimensions.",
str(prediction_head.layer_dims),
)
continue
if prediction_head.model_type == "span_classification":
@ -399,8 +400,8 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
converted_models.append(transformers_model)
else:
logger.error(
f"Haystack -> Transformers conversion is not supported yet for"
f" prediction heads of type {prediction_head.model_type}"
"Haystack -> Transformers conversion is not supported yet for prediction heads of type %s",
prediction_head.model_type,
)
return converted_models

View File

@ -93,7 +93,7 @@ class FeatureExtractor:
with open(config_file) as f:
config = json.load(f)
feature_extractor_classname = config["tokenizer_class"]
logger.debug(f"⛏️ Selected feature extractor: {feature_extractor_classname} (from {config_file})")
logger.debug("⛏️ Selected feature extractor: %s (from %s)", feature_extractor_classname, config_file)
# Use FastTokenizers as much as possible
try:
feature_extractor_class = getattr(transformers, feature_extractor_classname + "Fast")
@ -122,7 +122,7 @@ class FeatureExtractor:
f"\n- {f'{chr(10)}- '.join(FEATURE_EXTRACTORS.keys())}"
) from e
logger.debug(
f"⛏️ Selected feature extractor: {feature_extractor_class.__name__} (for model type '{model_type}')"
"⛏️ Selected feature extractor: %s (for model type '%s')", feature_extractor_class.__name__, model_type
)
self.default_params = DEFAULT_EXTRACTION_PARAMS.get(feature_extractor_class, {})

View File

@ -293,7 +293,7 @@ class HFLanguageModel(LanguageModel):
model_emb_size = self.model.resize_token_embeddings(new_num_tokens=None).num_embeddings
vocab_size = model_emb_size + n_added_tokens
logger.info(
f"Resizing embedding layer of LM from {model_emb_size} to {vocab_size} to cope with custom vocab."
"Resizing embedding layer of LM from %s to %s to cope with custom vocab.", model_emb_size, vocab_size
)
self.model.resize_token_embeddings(vocab_size)
# verify
@ -464,7 +464,7 @@ class HFLanguageModelNoSegmentIds(HFLanguageModelWithPooler):
specified using the arguments `output_hidden_states` and `output_attentions`.
"""
if segment_ids is not None:
logger.warning(f"'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)
logger.warning("'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)
return super().forward(
input_ids=input_ids,
@ -636,8 +636,9 @@ class DPREncoder(LanguageModel):
"""
if model_config.model_type.lower() != "bert":
logger.warning(
f"Using a model of type '{model_config.model_type}' which might be incompatible with DPR encoders. "
f"Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors."
"Using a model of type '%s' which might be incompatible with DPR encoders. "
"Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors.",
model_config.model_type,
)
config_dict = vars(model_config)
if model_kwargs:
@ -876,12 +877,13 @@ def get_language_model(
if not model_type:
logger.error(
f"Model type not understood for '{pretrained_model_name_or_path}' "
f"({model_type if model_type else 'model_type not set'}). "
"Model type not understood for '%s' (%s). "
"Either supply the local path for a saved model, "
"or the name of a model that can be downloaded from the Model Hub. "
"Ensure that the model class name can be inferred from the directory name "
"when loading a Transformers model."
"when loading a Transformers model.",
pretrained_model_name_or_path,
model_type if model_type else "model_type not set",
)
logger.error("Using the AutoModel class for '%s'. This can cause crashes!", pretrained_model_name_or_path)
model_type = "Auto"
@ -957,7 +959,7 @@ def _get_model_type(
if model_type and model_type.lower() == "roberta" and "mlm" in model_name_or_path.lower():
logger.error(
f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later."
"MLM part of codebert is currently not supported in Haystack: '%s' may crash later.", model_name_or_path
)
return model_type

View File

@ -88,13 +88,14 @@ def get_model(
config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, **autoconfig_kwargs)
model_type = config.model_type
except Exception as e:
logger.debug(f"Can't find model type for {pretrained_model_name_or_path}: {e}")
logger.debug("Can't find model type for %s: %s", pretrained_model_name_or_path, e)
if feature_extractor_kwargs is not None:
logger.debug(
"Can't forward feature_extractor_kwargs to a SentenceTransformers model. "
"These kwargs are being dropped. "
f"Content of feature_extractor_kwargs: {feature_extractor_kwargs}"
"Content of feature_extractor_kwargs: %s",
feature_extractor_kwargs,
)
else:
@ -102,9 +103,10 @@ def get_model(
config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, **autoconfig_kwargs)
if not config.model_type:
logger.error(
f"Model type not understood for '{pretrained_model_name_or_path}'. Please provide the name of "
"Model type not understood for '%s'. Please provide the name of "
"a model that can be downloaded from the Model Hub.\nUsing the AutoModel class. "
"THIS CAN CAUSE CRASHES and won't work for models that are not working with text."
"THIS CAN CAUSE CRASHES and won't work for models that are not working with text.",
pretrained_model_name_or_path,
)
model_type = None
else:
@ -112,10 +114,13 @@ def get_model(
model_type = HUGGINGFACE_CAPITALIZE[config.model_type.lower()]
except KeyError as e:
logger.error(
f"Haystack doesn't support model '{pretrained_model_name_or_path}' (type '{config.model_type.lower()}') "
"Haystack doesn't support model '%s' (type '%s') "
"We'll use the AutoModel class for it. "
"THIS CAN CAUSE CRASHES and won't work for models that are not working with text. "
f"Supported model types: {', '.join(HUGGINGFACE_CAPITALIZE.keys())}"
"Supported model types: %s",
pretrained_model_name_or_path,
config.model_type.lower(),
", ".join(HUGGINGFACE_CAPITALIZE.keys()),
)
model_type = None
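
Hunks like the one above, carrying three or more positional arguments, are where the deferred style is easiest to get wrong: a placeholder/argument mismatch raises nothing at the call site and only surfaces when the record is rendered (pylint's logging-too-many-args and logging-too-few-args checks catch most cases statically). A sketch of the runtime failure mode:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Three placeholders, two arguments: no exception is raised here.
    logger.info("model=%s type=%s supported=%s", "some-model", "bert")
    # At emit time the logging module catches the TypeError internally and
    # prints "--- Logging error ---" plus a traceback to stderr instead.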

View File

@ -25,9 +25,11 @@ class HaystackModel(ABC):
See the values of `haystack.schema.ContentTypes`.
"""
logger.info(
f" 🤖 Loading '{pretrained_model_name_or_path}' "
f"({self.__class__.__name__} of type '{model_type if model_type else '<unknown>'}' "
f"for {content_type} data)"
" 🤖 Loading '%s' (%s of type '%s' for %s data)",
pretrained_model_name_or_path,
self.__class__.__name__,
model_type if model_type else "<unknown>",
content_type,
)
self.model_name_or_path = pretrained_model_name_or_path
self.model_type = model_type

View File

@ -164,8 +164,11 @@ class PredictionHead(nn.Module):
return
new_dims = [input_dim] + old_dims[1:]
logger.info(
f"Resizing input dimensions of {type(self).__name__} ({self.task_name}) "
f"from {old_dims} to {new_dims} to match language model"
"Resizing input dimensions of %s (%s) from %s to %s to match language model",
type(self).__name__,
self.task_name,
old_dims,
new_dims,
)
self.feed_forward = FeedForwardBlock(new_dims)
self.layer_dims[0] = input_dim
@ -260,8 +263,8 @@ class QuestionAnsweringHead(PredictionHead):
super(QuestionAnsweringHead, self).__init__()
if len(kwargs) > 0:
logger.warning(
f"Some unused parameters are passed to the QuestionAnsweringHead. "
f"Might not be a problem. Params: {json.dumps(kwargs)}"
"Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: %s",
json.dumps(kwargs),
)
self.layer_dims = layer_dims
assert self.layer_dims[-1] == 2
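
One caveat visible in the hunk above: %-style templates defer the string rendering, but the arguments themselves, here json.dumps(kwargs), are still evaluated eagerly at the call site. When an argument is expensive to build, a guard keeps it off the hot path; a sketch:

    import json
    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger(__name__)
    kwargs = {"unused_param": 42}

    # json.dumps() runs even if WARNING is disabled for this logger.
    logger.warning("Might not be a problem. Params: %s", json.dumps(kwargs))

    # Guarded variant: serialization happens only when the record would be emitted.
    if logger.isEnabledFor(logging.WARNING):
        logger.warning("Might not be a problem. Params: %s", json.dumps(kwargs))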

View File

@ -105,20 +105,23 @@ class QACandidate:
self.answer = "no_answer"
if self.offset_answer_start != 0 or self.offset_answer_end != 0:
logger.error(
f"Both start and end offsets should be 0: \n"
f"{self.offset_answer_start}, {self.offset_answer_end} with a no_answer. "
"Both start and end offsets should be 0: \n%s, %s with a no_answer. ",
self.offset_answer_start,
self.offset_answer_end,
)
else:
self.answer = string
if self.offset_answer_end - self.offset_answer_start <= 0:
logger.error(
f"End offset comes before start offset: \n"
f"({self.offset_answer_start}, {self.offset_answer_end}) with a span answer. "
"End offset comes before start offset: \n(%s, %s) with a span answer. ",
self.offset_answer_start,
self.offset_answer_end,
)
elif self.offset_answer_end <= 0:
logger.error(
f"Invalid end offset: \n"
f"({self.offset_answer_start}, {self.offset_answer_end}) with a span answer. "
"Invalid end offset: \n(%s, %s) with a span answer. ",
self.offset_answer_start,
self.offset_answer_end,
)
def _create_context_window(self, context_window_size: int, clear_text: str) -> Tuple[str, int, int]:
@ -167,7 +170,8 @@ class QACandidate:
"""
if self.offset_unit != "token":
logger.error(
f"QACandidate needs to have self.offset_unit=token before calling _span_to_string() (id = {self.passage_id})"
"QACandidate needs to have self.offset_unit=token before calling _span_to_string() (id = %s)",
self.passage_id,
)
start_t = self.offset_answer_start

View File

@ -104,8 +104,10 @@ class Trainer:
if use_amp in amp_mapping:
logger.warning(
"The Trainer only supports native PyTorch automatic mixed precision and no longer supports the Apex library.\n"
f"Because you provided Apex optimization level {use_amp}, automatic mixed precision was set to {amp_mapping[use_amp]}.\n"
"In the future, set `use_amp=True` to turn on automatic mixed precision."
"Because you provided Apex optimization level %s, automatic mixed precision was set to %s.\n"
"In the future, set `use_amp=True` to turn on automatic mixed precision.",
use_amp,
amp_mapping[use_amp],
)
use_amp = amp_mapping[use_amp]
else:
@ -570,8 +572,11 @@ class Trainer:
if ranks_with_data < torch.distributed.get_world_size():
if step is not None:
logger.info(
f"Stopping epoch {self.from_epoch} at step {step} for rank {self.local_rank} since at least one other rank "
f"(~ one GPU) in distributed training doesn't have any more batches... "
"Stopping epoch %s at step %s for rank %s since at least one other rank "
"(~ one GPU) in distributed training doesn't have any more batches... ",
self.from_epoch,
step,
self.local_rank,
)
return False
else:

View File

@ -191,13 +191,15 @@ class OpenAIAnswerGenerator(BaseGenerator):
if len(input_docs) == 0:
logger.warning(
f"Skipping all of the provided Documents, as none of them fits the maximum token limit of "
f"{self.MAX_TOKENS_LIMIT}. The generated answers will therefore not be conditioned on any context."
"Skipping all of the provided Documents, as none of them fits the maximum token limit of %s"
"The generated answers will therefore not be conditioned on any context.",
self.MAX_TOKENS_LIMIT,
)
elif skipped_docs >= 1:
logger.warning(
f"Skipping {skipped_docs} of the provided Documents, as using them would exceed the maximum token "
f"limit of {self.MAX_TOKENS_LIMIT}."
"Skipping %s of the provided Documents, as using them would exceed the maximum token limit of %s.",
skipped_docs,
self.MAX_TOKENS_LIMIT,
)
# Top ranked documents should go at the end
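
Splitting a long template across implicitly concatenated literals, as in the hunk above, is the main source of typos in a change like this: Python joins adjacent string literals with no separator, so a fragment that lacks its trailing space silently glues two sentences together. Rendering the template with sample arguments makes that visible; a sketch:

    # Note the trailing space before each closing quote except the last.
    template = (
        "Skipping all of the provided Documents, as none of them fits the maximum token limit of %s. "
        "The generated answers will therefore not be conditioned on any context."
    )
    # Printing a rendered sample exposes any missing separator immediately.
    print(template % (2048,))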

View File

@ -131,8 +131,9 @@ class RAGenerator(BaseGenerator):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.tokenizer = RagTokenizer.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
@ -389,8 +390,9 @@ class Seq2SeqGenerator(BaseGenerator):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
Seq2SeqGenerator._register_converters(model_name_or_path, input_converter)

View File

@ -52,8 +52,9 @@ class TextToSpeech:
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(resolved_devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {resolved_devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
resolved_devices[0],
)
self.model = _Text2SpeechModel.from_pretrained(

View File

@ -311,7 +311,12 @@ class Crawler(BaseComponent):
json.dump(document.to_dict(), f)
except Exception as e:
logging.exception(
f"Crawler can't save the content of '{link}' under '{file_path}'. This webpage will be skipped, but links from this page will still be crawled. Make sure the path above is accessible and the file name is valid. If the file name is invalid, consider setting 'crawler_naming_function' to another function."
"Crawler can't save the content of '%s' under '%s'. "
"This webpage will be skipped, but links from this page will still be crawled. "
"Make sure the path above is accessible and the file name is valid. "
"If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
link,
file_path,
)
paths.append(file_path)

View File

@ -123,15 +123,17 @@ class TransformersDocumentClassifier(BaseDocumentClassifier):
if labels and task == "text-classification":
logger.warning(
f"Provided labels {labels} will be ignored for task text-classification. Set task to "
f"zero-shot-classification to use labels."
"Provided labels %s will be ignored for task text-classification. Set task to "
"zero-shot-classification to use labels.",
labels,
)
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(resolved_devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {resolved_devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
resolved_devices[0],
)
if tokenizer is None:

View File

@ -75,16 +75,18 @@ class EvalDocuments(BaseComponent):
self.top_k_used = top_k
elif self.top_k_used != top_k:
logger.warning(
f"EvalDocuments was last run with top_k_eval_documents={self.top_k_used} but is "
f"being run again with top_k={self.top_k}. "
f"The evaluation counter is being reset from this point so that the evaluation "
f"metrics are interpretable."
"EvalDocuments was last run with top_k_eval_documents=%s} but is "
"being run again with top_k=%s. "
"The evaluation counter is being reset from this point so that the evaluation "
"metrics are interpretable.",
self.top_k_used,
self.top_k,
)
self.init_counts()
if len(documents) < top_k and not self.too_few_docs_warning:
logger.warning(
f"EvalDocuments is being provided less candidate documents than top_k " f"(currently set to {top_k})."
"EvalDocuments is being provided less candidate documents than top_k (currently set to %s).", top_k
)
self.too_few_docs_warning = True

View File

@ -123,8 +123,9 @@ class EntityExtractor(BaseComponent):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.batch_size = batch_size
self.progress_bar = progress_bar

View File

@ -63,8 +63,9 @@ class FileTypeClassifier(BaseComponent):
return mimetypes.guess_extension(extension) or ""
except NameError as ne:
logger.error(
f"The type of '{file_path}' could not be guessed, probably because 'python-magic' is not installed. Ignoring this error."
"Please make sure the necessary OS libraries are installed if you need this functionality ('python-magic' or 'python-magic-bin' on Windows)."
"The type of '%s' could not be guessed, probably because 'python-magic' is not installed. Ignoring this error."
"Please make sure the necessary OS libraries are installed if you need this functionality ('python-magic' or 'python-magic-bin' on Windows).",
file_path,
)
return ""

View File

@ -201,8 +201,10 @@ class AzureConverter(BaseConverter):
file_text += f" {cell}"
if not self.validate_language(file_text, valid_languages):
logger.warning(
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
"The language for %s is not one of %s. The file may not have "
"been decoded in the correct text format.",
file_path,
valid_languages,
)
return docs

View File

@ -146,8 +146,9 @@ class ImageToTextConverter(BaseConverter):
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text, valid_languages):
logger.warning(
f"The language for image is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
"The language for image is not one of %s. The file may not have "
"been decoded in the correct text format.",
valid_languages,
)
text = "\f".join(cleaned_pages)

View File

@ -200,8 +200,10 @@ class ParsrConverter(BaseConverter):
file_text += f" {cell}"
if not self.validate_language(file_text, valid_languages):
logger.warning(
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
"The language for %s is not one of %s. The file may not have "
"been decoded in the correct text format.",
file_path,
valid_languages,
)
if extract_headlines:

View File

@ -150,8 +150,10 @@ class PDFToTextConverter(BaseConverter):
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text, valid_languages):
logger.warning(
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
"The language for %s is not one of %s. The file may not have "
"been decoded in the correct text format.",
file_path,
valid_languages,
)
text = "\f".join(cleaned_pages)

View File

@ -172,8 +172,10 @@ class TikaConverter(BaseConverter):
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text, valid_languages):
logger.warning(
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
"The language for %s is not one of %s. The file may not have "
"been decoded in the correct text format.",
file_path,
valid_languages,
)
text = "\f".join(cleaned_pages)

View File

@ -75,8 +75,10 @@ class TextConverter(BaseConverter):
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text, valid_languages):
logger.warning(
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
"The language for %s is not one of %s. The file may not have "
"been decoded in the correct text format.",
file_path,
valid_languages,
)
text = "".join(cleaned_pages)

View File

@ -119,8 +119,9 @@ class PseudoLabelGenerator(BaseComponent):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.retriever = retriever

View File

@ -106,7 +106,7 @@ class PreProcessor(BasePreProcessor):
try:
nltk.download("punkt")
except FileExistsError as error:
logger.debug(f"NLTK punkt tokenizer seems to be already downloaded. Error message: {error}")
logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
pass
self.clean_whitespace = clean_whitespace
self.clean_header_footer = clean_header_footer
@ -747,14 +747,16 @@ class PreProcessor(BasePreProcessor):
# NLTK failed to load custom SentenceTokenizer, fallback to the default model or to English
if language_name is not None:
logger.error(
f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. "
f"Using default {self.language} model."
"PreProcessor couldn't find custom sentence tokenizer model for %s. Using default %s model.",
self.language,
self.language,
)
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
else:
logger.error(
f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. "
f"Using English instead."
"PreProcessor couldn't find default or custom sentence tokenizer model for %s. "
"Using English instead.",
self.language,
)
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
@ -763,8 +765,9 @@ class PreProcessor(BasePreProcessor):
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
else:
logger.error(
f"PreProcessor couldn't find the default sentence tokenizer model for {self.language}. "
f" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter."
"PreProcessor couldn't find the default sentence tokenizer model for %s. "
" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
self.language,
)
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")

View File

@ -125,8 +125,11 @@ class PromptTemplate(BasePromptTemplate, ABC):
if args:
if len(args) != len(self.prompt_params):
logger.warning(
f"For {self.name}, expected {self.prompt_params} arguments, instead "
f"got {len(args)} arguments {args}"
"For %s, expected %s arguments, instead got %s arguments %s",
self.name,
self.prompt_params,
len(args),
args,
)
for prompt_param, arg in zip(self.prompt_params, args):
template_dict[prompt_param] = [arg] if isinstance(arg, str) else arg
@ -229,8 +232,9 @@ class HFLocalInvocationLayer(PromptModelInvocationLayer):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
# Due to reflective construction of all invocation layers we might receive some

View File

@ -100,8 +100,9 @@ class TransformersQueryClassifier(BaseQueryClassifier):
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(resolved_devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {resolved_devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
resolved_devices[0],
)
self.model = pipeline(

View File

@ -81,8 +81,9 @@ class QuestionGenerator(BaseComponent):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.model = AutoModelForSeq2SeqLM.from_pretrained(
model_name_or_path, revision=model_version, use_auth_token=use_auth_token

View File

@ -1023,9 +1023,10 @@ class FARMReader(BaseReader):
if self.top_k_per_candidate != 4:
logger.info(
f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n"
f"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
f"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5"
"Performing Evaluation using top_k_per_candidate = %s \n"
"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5",
self.top_k_per_candidate,
)
# extract all questions for evaluation
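
A failure mode worth noting when stripping f-prefixes by hand: if the f is removed but a {placeholder} is left behind, the braces are logged verbatim, and the %-style lint checks won't flag brace syntax. A sketch of the symptom:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    top_k = 4

    # The f-prefix was dropped but a brace expression stayed behind:
    # this logs the literal text "{top_k + 1}", not the value 5.
    logger.info("n_best = {top_k + 1}, top_k_per_candidate = %s", top_k)
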
@ -1062,7 +1063,7 @@ class FARMReader(BaseReader):
continue
if label.answer.offsets_in_document is None:
logger.error(
f"Label.answer.offsets_in_document was None, but Span object was expected: {label} "
"Label.answer.offsets_in_document was None, but Span object was expected: %s ", label
)
continue
# add to existing answers
@ -1074,7 +1075,11 @@ class FARMReader(BaseReader):
# Hack to fix problem where duplicate questions are merged by doc_store processing creating a QA example with 8 annotations > 6 annotation max
if len(aggregated_per_question[aggregation_key]["answers"]) >= 6:
logger.warning(
f"Answers in this sample are being dropped because it has more than 6 answers. (doc_id: {doc_id}, question: {label.query}, label_id: {label.id})"
"Answers in this sample are being dropped because it has more than 6 answers. "
"(doc_id: %s, question: %s, label_id: %s)",
doc_id,
label.query,
label.id,
)
continue
aggregated_per_question[aggregation_key]["answers"].append(

View File

@ -116,8 +116,9 @@ class TableReader(BaseReader):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
config = TapasConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
@ -646,8 +647,9 @@ class RCIReader(BaseReader):
self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.row_model = AutoModelForSequenceClassification.from_pretrained(

View File

@ -86,8 +86,9 @@ class TransformersReader(BaseReader):
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.model = pipeline(

View File

@ -104,14 +104,18 @@ class _BaseEmbeddingEncoder:
if model_similarity is not None and document_store.similarity != model_similarity:
logger.warning(
f"You seem to be using {model_name} model with the {document_store.similarity} function instead of the recommended {model_similarity}. "
f"This can be set when initializing the DocumentStore"
"You seem to be using %s model with the %s function instead of the recommended %s. "
"This can be set when initializing the DocumentStore",
model_name,
document_store.similarity,
model_similarity,
)
elif "dpr" in model_name.lower() and document_store.similarity != "dot_product":
logger.warning(
f"You seem to be using a DPR model with the {document_store.similarity} function. "
f"We recommend using dot_product instead. "
f"This can be set when initializing the DocumentStore"
"You seem to be using a DPR model with the %s function. "
"We recommend using dot_product instead. "
"This can be set when initializing the DocumentStore",
document_store.similarity,
)

View File

@ -245,9 +245,8 @@ class BaseRetriever(BaseComponent):
mean_avg_precision = summed_avg_precision / number_of_questions
logger.info(
(
f"For {correct_retrievals} out of {number_of_questions} questions ({recall:.2%}), the answer was in"
f" the top-{top_k} candidate passages selected by the retriever."
"For {} out of {} questions ({:.2%}), the answer was in the top-{} candidate passages selected by the retriever.".format(
correct_retrievals, number_of_questions, recall, top_k
)
)

View File

@ -178,9 +178,10 @@ class DensePassageRetriever(DenseRetriever):
if document_store and document_store.similarity != "dot_product":
logger.warning(
f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
"You are using a Dense Passage Retriever model with the %s function. "
"We recommend you use dot_product instead. "
"This can be set when initializing the DocumentStore"
"This can be set when initializing the DocumentStore",
document_store.similarity,
)
# Init & Load Encoders
@ -550,8 +551,9 @@ class DensePassageRetriever(DenseRetriever):
"""
if self.processor.num_hard_negatives != 0:
logger.warning(
f"'num_hard_negatives' is set to {self.processor.num_hard_negatives}, but inference does "
f"not require any hard negatives. Setting num_hard_negatives to 0."
"'num_hard_negatives' is set to %s, but inference does "
"not require any hard negatives. Setting num_hard_negatives to 0.",
self.processor.num_hard_negatives,
)
self.processor.num_hard_negatives = 0
@ -1163,8 +1165,9 @@ class TableTextRetriever(DenseRetriever):
if self.processor.num_hard_negatives != 0:
logger.warning(
f"'num_hard_negatives' is set to {self.processor.num_hard_negatives}, but inference does "
f"not require any hard negatives. Setting num_hard_negatives to 0."
"'num_hard_negatives' is set to %s, but inference does "
"not require any hard negatives. Setting num_hard_negatives to 0.",
self.processor.num_hard_negatives,
)
self.processor.num_hard_negatives = 0
@ -1532,10 +1535,11 @@ class EmbeddingRetriever(DenseRetriever):
and model_format != "sentence_transformers"
):
logger.warning(
f"You seem to be using a Sentence Transformer embedding model but 'model_format' is set to '{self.model_format}'."
f" You may need to set model_format='sentence_transformers' to ensure correct loading of model."
f"As an alternative, you can let Haystack derive the format automatically by not setting the "
f"'model_format' parameter at all."
"You seem to be using a Sentence Transformer embedding model but 'model_format' is set to '%s'."
" You may need to set model_format='sentence_transformers' to ensure correct loading of model."
"As an alternative, you can let Haystack derive the format automatically by not setting the "
"'model_format' parameter at all.",
self.model_format,
)
self.embedding_encoder = _EMBEDDING_ENCODERS[self.model_format](retriever=self)

View File

@ -96,8 +96,9 @@ class TransformersSummarizer(BaseSummarizer):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s} inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
if tokenizer is None:

View File

@ -83,8 +83,9 @@ class TransformersTranslator(BaseTranslator):
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
f"using the first device {self.devices[0]}."
"Multiple devices are not supported in %s inference, using the first device %s.",
self.__class__.__name__,
self.devices[0],
)
self.max_seq_len = max_seq_len
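
A side benefit of the template-plus-args form over eager f-strings: both the unformatted template and its arguments survive on the LogRecord, so handlers and filters can consume them as structured data. A minimal sketch:

    import logging

    class ArgsEchoHandler(logging.Handler):
        def emit(self, record):
            # The unrendered template and its args are preserved on the record.
            print("template:", record.msg)
            print("args:", record.args)

    logger = logging.getLogger("demo")
    logger.addHandler(ArgsEchoHandler())
    logger.warning("Multiple devices are not supported in %s inference.", "TransformersTranslator")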

View File

@ -293,10 +293,13 @@ class Pipeline:
for document_store in document_stores:
if document_store["type"] != "DeepsetCloudDocumentStore":
logger.info(
f"In order to be used on Deepset Cloud, component '{document_store['name']}' of type '{document_store['type']}' "
f"has been automatically converted to type DeepsetCloudDocumentStore. "
f"Usually this replacement will result in equivalent pipeline quality. "
f"However depending on chosen settings of '{document_store['name']}' differences might occur."
"In order to be used on Deepset Cloud, component '%s' of type '%s' "
"has been automatically converted to type DeepsetCloudDocumentStore. "
"Usually this replacement will result in equivalent pipeline quality. "
"However depending on chosen settings of '%s' differences might occur.",
document_store["name"],
document_store["type"],
document_store["name"],
)
document_store["type"] = "DeepsetCloudDocumentStore"
document_store["params"] = {}
@ -784,7 +787,7 @@ class Pipeline:
# crop dataset if `dataset_size` is provided and is valid
if num_documents is not None and 0 < num_documents < len(corpus):
logger.info(f"Cropping dataset from {len(corpus)} to {num_documents} documents")
logger.info("Cropping dataset from %s to %s documents", len(corpus), num_documents)
corpus = dict(itertools.islice(corpus.items(), num_documents))
# Remove queries that don't contain the remaining documents
corpus_ids = set(list(corpus.keys()))
@ -800,8 +803,9 @@ class Pipeline:
qrels = qrels_new
elif num_documents is not None and (num_documents < 1 or num_documents > len(corpus)):
logging.warning(
f"'num_documents' variable should be lower than corpus length and have a positive value, but it's {num_documents}."
" Dataset size remains unchanged."
"'num_documents' variable should be lower than corpus length and have a positive value, but it's %s."
" Dataset size remains unchanged.",
num_documents,
)
# check index before eval
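As a side note, the cropping above works because dicts preserve insertion order in Python 3.7+; a minimal sketch of the same idiom with illustrative data:

import itertools

corpus = {"d1": "text one", "d2": "text two", "d3": "text three"}
num_documents = 2

# Keep only the first `num_documents` entries of the ordered corpus dict.
if num_documents is not None and 0 < num_documents < len(corpus):
    corpus = dict(itertools.islice(corpus.items(), num_documents))

assert list(corpus) == ["d1", "d2"]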

View File

@ -82,7 +82,11 @@ def get_component_definitions(
param_name = key.replace(env_prefix, "").lower()
component_definition["params"][param_name] = value
logger.info(
f"Param '{param_name}' of component '{name}' overwritten with environment variable '{key}' value '{value}'."
"Param '%s' of component '%s' overwritten with environment variable '%s' value '%s'.",
param_name,
name,
key,
value,
)
return component_definitions
@ -291,11 +295,13 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
ok_to_ignore_version = pipeline_version == "ignore" and "rc" in __version__
if not ok_to_ignore_version:
logging.warning(
f"This pipeline is version '{pipeline_version}', but you're using Haystack {__version__}\n"
"This pipeline is version '%s', but you're using Haystack %s\n"
"This might cause bugs and unexpected behaviors."
"Please check out the release notes (https://github.com/deepset-ai/haystack/releases/latest), "
"the documentation (https://haystack.deepset.ai/components/pipelines#yaml-file-definitions) "
"and fix your configuration accordingly."
"and fix your configuration accordingly.",
pipeline_version,
__version__,
)
# Load the json schema, and create one if it doesn't exist yet
@ -317,7 +323,8 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
if validation.instance["type"] not in loaded_custom_nodes:
logger.info(
f"Missing definition for node of type {validation.instance['type']}. Looking into local classes..."
"Missing definition for node of type %s. Looking into local classes...",
validation.instance["type"],
)
missing_component_class = BaseComponent.get_subclass(validation.instance["type"])
schema = inject_definition_in_schema(node_class=missing_component_class, schema=schema)

View File

@ -1092,8 +1092,10 @@ class EvaluationResult:
query_answers = answers[answers["multilabel_id"] == multilabel_id]
if answer_metric not in metrics:
logger.warning(
f"You specified an answer_metric={answer_metric} not available in calculated metrics={metrics.keys()}."
f"Skipping collection of worst performing samples."
"You specified an answer_metric=%s not available in calculated metrics=%s."
"Skipping collection of worst performing samples.",
answer_metric,
metrics.keys(),
)
break
if metrics[answer_metric] <= answer_metric_threshold:
@ -1127,8 +1129,10 @@ class EvaluationResult:
for multilabel_id, metrics in worst_df.iterrows():
if document_metric not in metrics:
logger.warning(
f"You specified a document_metric={document_metric} not available in calculated metrics={metrics.keys()}."
f"Skipping collection of worst performing samples."
"You specified a document_metric=%s not available in calculated metrics=%s."
"Skipping collection of worst performing samples.",
document_metric,
metrics.keys(),
)
break
if metrics[document_metric] <= document_metric_threshold:
@ -1185,9 +1189,9 @@ class EvaluationResult:
document_relevance_criterion = answer_scope_to_doc_relevance_crit.get(answer_scope, document_scope)
elif answer_scope in answer_scope_to_doc_relevance_crit.keys():
logger.warning(
f"You specified a non-answer document_scope together with a non-default answer_scope. "
f"This may result in inconsistencies between answer and document metrics. "
f"To enforce the same definition of correctness for both, document_scope must be one of {['answer', 'document_id_or_answer']}."
"You specified a non-answer document_scope together with a non-default answer_scope. "
"This may result in inconsistencies between answer and document metrics. "
"To enforce the same definition of correctness for both, document_scope must be one of 'answer', 'document_id_or_answer'."
)
return document_relevance_criterion # type: ignore[return-value]

View File

@ -250,7 +250,11 @@ def _write_telemetry_config():
# show a log message if telemetry config is written for the first time
if not CONFIG_PATH.is_file():
logger.info(
f"Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://docs.haystack.deepset.ai/docs/telemetry"
"Haystack sends anonymous usage data to understand the actual usage and steer dev efforts "
"towards features that are most meaningful to users. You can opt-out at anytime by calling "
"disable_telemetry() or by manually setting the environment variable "
"HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation "
"page. More information at https://docs.haystack.deepset.ai/docs/telemetry"
)
CONFIG_PATH.parents[0].mkdir(parents=True, exist_ok=True)
user_id = _get_or_create_user_id()

View File

@ -420,7 +420,11 @@ class IndexClient:
doc = response.json()
else:
logger.warning(
f"Document {id} could not be fetched from deepset Cloud: HTTP {response.status_code} - {response.reason}\n{response.content.decode()}"
"Document %s could not be fetched from deepset Cloud: HTTP %s - %s\n%s",
id,
response.status_code,
response.reason,
response.content.decode(),
)
return doc
@ -625,7 +629,9 @@ class PipelineClient:
else:
logger.info("Pipeline config '%s' is already deployed.", pipeline_config_name)
logger.info(
f"Search endpoint for pipeline config '{pipeline_config_name}' is up and running for you under {pipeline_url}"
"Search endpoint for pipeline config '%s' is up and running for you under %s",
pipeline_config_name,
pipeline_url,
)
if show_curl_message:
curl_cmd = (
@ -925,7 +931,9 @@ class EvaluationSetClient:
with open(file_path, "rb") as file:
self.client.post(url=target_url, files={"file": (file_path.name, file, mime_type)})
logger.info(
f"Successfully uploaded evaluation set file {file_path}. You can access it now under evaluation set '{file_path.name}'."
"Successfully uploaded evaluation set file %s. You can access it now under evaluation set '%s'.",
file_path,
file_path.name,
)
except DeepsetCloudError as e:
logger.error("Error uploading evaluation set file %s: %s", file_path, e.args)

View File

@ -87,8 +87,9 @@ def stop_container(container_name, delete_container=False):
status = subprocess.run([f"docker stop {container_name}"], shell=True)
if status.returncode:
logger.warning(
f"Tried to stop {container_name} but this failed. "
f"It is likely that there was no Docker container with the name {container_name}"
"Tried to stop %s but this failed. It is likely that there was no Docker container with the name %s",
container_name,
container_name,
)
if delete_container:
status = subprocess.run([f"docker rm {container_name}"], shell=True)

View File

@ -61,8 +61,8 @@ def print_answers(results: dict, details: str = "all", max_text_len: Optional[in
filtered_answers = answers
else:
valid_values = ", ".join(fields_to_keep_by_level.keys()) + " and 'all'"
logging.warn(f"print_answers received details='{details}', which was not understood. ")
logging.warn(f"Valid values are {valid_values}. Using 'all'.")
logging.warn("print_answers received details='%s', which was not understood. ", details)
logging.warn("Valid values are %s. Using 'all'.", valid_values)
filtered_answers = answers
# Shorten long text fields

View File

@ -50,8 +50,11 @@ def retry_with_exponential_backoff(
# Sleep for the delay
logger.warning(
f"{e.__class__.__name__ } - {e}, "
f"retry {function.__name__} in {'{0:.2f}'.format(sleep_time)} seconds..."
"%s - %s, retry %s in %s seconds...",
e.__class__.__name__,
e,
function.__name__,
"{0:.2f}".format(sleep_time),
)
time.sleep(sleep_time)

View File

@ -178,7 +178,7 @@ def create_dpr_training_dataset(squad_data: dict, retriever: BaseRetriever, num_
if not hard_negative_ctxs or not positive_ctxs:
logging.error(
f"No retrieved candidates for article {article_title}, with question {question['question']}"
"No retrieved candidates for article %s, with question %s", article_title, question["question"]
)
n_non_added_questions += 1
continue

View File

@ -260,7 +260,6 @@ disable = [
"too-few-public-methods",
"raise-missing-from",
"invalid-name",
"logging-fstring-interpolation",
"too-many-locals",
"duplicate-code",
"too-many-arguments",

View File

@ -171,8 +171,9 @@ def export_feedback(
context = squad_label["paragraphs"][0]["context"]
if not context[start : start + len(answer)] == answer:
logger.error(
f"Skipping invalid squad label as string via offsets "
f"('{context[start:start + len(answer)]}') does not match answer string ('{answer}') "
"Skipping invalid squad label as string via offsets ('%s') does not match answer string ('%s') ",
context[start : start + len(answer)],
answer,
)
export_data.append(squad_label)
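For intuition, the guard above verifies that the answer offsets stored in a SQuAD label still reproduce the answer string; a minimal sketch with illustrative data:

context = "Haystack is an open-source framework."
answer = "open-source"
start = context.find(answer)  # plays the role of the stored answer_start offset

# A label is exported only if the slice at the recorded offset matches the answer.
assert context[start : start + len(answer)] == answer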

View File

@ -97,22 +97,27 @@ def _format_filters(filters):
new_filters = {}
if filters is None:
logger.warning(
f"Request with deprecated filter format ('\"filters\": null'). "
f"Remove empty filters from params to be compliant with future versions"
"Request with deprecated filter format ('\"filters\": null'). "
"Remove empty filters from params to be compliant with future versions"
)
else:
for key, values in filters.items():
if values is None:
logger.warning(
f"Request with deprecated filter format ('{key}: null'). "
f"Remove null values from filters to be compliant with future versions"
"Request with deprecated filter format ('%s: null'). "
"Remove null values from filters to be compliant with future versions",
key,
)
continue
if not isinstance(values, list):
logger.warning(
f"Request with deprecated filter format ('{key}': {values}). "
f"Change to '{key}':[{values}]' to be compliant with future versions"
"Request with deprecated filter format ('%s': %s). "
"Change to '%s':[%s]' to be compliant with future versions",
key,
values,
key,
values,
)
values = [values]
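For illustration, assuming the helper above goes on to store the surviving entries in new_filters and return them, a deprecated-format request would be normalized roughly like this:

filters = {"category": "news", "year": None}
formatted = _format_filters(filters)
# Warns once for the null value (dropped) and once for the scalar (wrapped),
# yielding {"category": ["news"]}.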