mirror of https://github.com/deepset-ai/haystack.git
synced 2025-10-15 09:58:43 +00:00

chore: enable logging-fstring-interpolation and cleanup (#3843)

* enable logging-fstring-interpolation
* remove logging-fstring-interpolation from exclusion list
* remove implicit string interpolations added by black
* remove from rest_api too
* fix % sign

This commit is contained in:
parent 4cbc8550d6
commit d157e41c1f
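Every hunk in this diff applies the same mechanical change: f-string log messages become %-style format strings with the values passed as positional arguments, so interpolation is deferred until the logging framework knows the record will actually be emitted. This is the behavior that pylint's logging-fstring-interpolation check (W1203) enforces, and the commit turns the check on by removing it from the exclusion list. A minimal sketch of the pattern (the logger name and values are illustrative, not taken from the diff):

import logging

logger = logging.getLogger(__name__)
index_name = "documents"

# Before: the f-string is rendered eagerly, even when INFO is disabled.
logger.info(f"The index '{index_name}' doesn't exist.")

# After: the message is only formatted if the record passes the level
# check, and log aggregators can group records by the constant template.
logger.info("The index '%s' doesn't exist.", index_name)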
@@ -476,8 +476,9 @@ class BaseDocumentStore(BaseComponent):
         else:
             jsonl_filename = (file_path.parent / (file_path.stem + ".jsonl")).as_posix()
             logger.info(
-                f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
-                f"Converting json to jsonl to: {jsonl_filename}"
+                "Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
+                "Converting json to jsonl to: %s",
+                jsonl_filename,
             )
             squad_json_to_jsonl(filename, jsonl_filename)
             self.add_eval_data(
@@ -622,8 +623,9 @@ class BaseDocumentStore(BaseComponent):
         for document in documents:
             if document.id in _hash_ids:
                 logger.info(
-                    f"Duplicate Documents: Document with id '{document.id}' already exists in index "
-                    f"'{index or self.index}'"
+                    "Duplicate Documents: Document with id '%s' already exists in index '%s'",
+                    document.id,
+                    index or self.index,
                 )
                 continue
             _documents.append(document)
@@ -118,23 +118,25 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
                 indexing_info = index_info["indexing"]
                 if indexing_info["pending_file_count"] > 0:
                     logger.warning(
-                        f"{indexing_info['pending_file_count']} files are pending to be indexed. "
-                        f"Indexing status: {indexing_info['status']}"
+                        "%s files are pending to be indexed. Indexing status: %s",
+                        indexing_info["pending_file_count"],
+                        indexing_info["status"],
                     )
                 if index in deployed_unhealthy_pipelines:
                     logger.warning(
-                        f"The index '{index}' is unhealthy and should be redeployed using "
-                        f"`Pipeline.undeploy_on_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`."
+                        "The index '%s' is unhealthy and should be redeployed using "
+                        "`Pipeline.undeploy_on_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`.",
+                        index,
                     )
             else:
                 logger.info(
-                    f"You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
-                    f"This document store always returns empty responses. This can be useful if you want to "
-                    f"create a new pipeline within deepset Cloud.\n"
-                    f"In order to create a new pipeline on deepset Cloud, take the following steps: \n"
-                    f" - create query and indexing pipelines using this DocumentStore\n"
-                    f" - call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name`\n"
-                    f" - call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name`"
+                    "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
+                    "This document store always returns empty responses. This can be useful if you want to "
+                    "create a new pipeline within deepset Cloud.\n"
+                    "In order to create a new pipeline on deepset Cloud, take the following steps: \n"
+                    " - create query and indexing pipelines using this DocumentStore\n"
+                    " - call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name`\n"
+                    " - call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name`"
                 )

         self.evaluation_set_client = DeepsetCloud.get_evaluation_set_client(
@@ -508,9 +508,10 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore):

         if not any(indices):
             logger.warning(
-                f"To use an index, you must create it first. The index called '{index_name}' doesn't exist. "
-                f"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
-                f"Note that this instance doesn't validate the index after you create it."
+                "To use an index, you must create it first. The index called '%s' doesn't exist. "
+                "You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
+                "Note that this instance doesn't validate the index after you create it.",
+                index_name,
             )

         # If the index name is an alias that groups multiple existing indices, each of them must have an embedding_field.
@@ -206,7 +206,10 @@ class FAISSDocumentStore(SQLDocumentStore):
             index.hnsw.efConstruction = ef_construction

             logger.info(
-                f"HNSW params: n_links: {n_links}, efSearch: {index.hnsw.efSearch}, efConstruction: {index.hnsw.efConstruction}"
+                "HNSW params: n_links: %s, efSearch: %s, efConstruction: %s",
+                n_links,
+                index.hnsw.efSearch,
+                index.hnsw.efConstruction,
             )
         else:
             index = faiss.index_factory(embedding_dim, index_factory, metric_type)
@@ -550,8 +553,10 @@ class FAISSDocumentStore(SQLDocumentStore):
         """
         if index == self.index:
             logger.warning(
-                f"Deletion of default index '{index}' detected. "
-                f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+                "Deletion of default index '%s' detected. "
+                "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+                index,
+                self.__class__.__name__,
             )
         if index in self.faiss_indexes:
             del self.faiss_indexes[index]
@@ -110,8 +110,9 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         self.devices, _ = initialize_device_settings(devices=devices, use_cuda=self.use_gpu, multi_gpu=False)
         if len(self.devices) > 1:
             logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
             )

         self.main_device = self.devices[0]
@@ -184,7 +185,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
                 )
                 if duplicate_documents == "skip":
                     logger.warning(
-                        f"Duplicate Documents: Document with id '{document.id} already exists in index " f"'{index}'"
+                        "Duplicate Documents: Document with id '%s' already exists in index '%s'", document.id, index
                     )
                     continue
             self.indexes[index][document.id] = document
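The commit-message bullet about removing "implicit string interpolations added by black" refers to hunks like the one above, where black had packed two adjacent f-string literals (f"..." f"...") onto a single line. Since adjacent string literals are concatenated at compile time, the rewritten %-style message can simply be split across lines with no + operator. A hypothetical variant of the call above, shown only to illustrate the shape (the values are made up):

import logging

logger = logging.getLogger(__name__)
doc_id, index = "e5f6", "document"  # illustrative values

logger.warning(
    "Duplicate Documents: Document with id '%s' already exists "
    "in index '%s'",  # adjacent literals are joined at compile time
    doc_id,
    index,
)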
@@ -205,8 +206,9 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         textual_documents = [doc for doc in all_documents if doc.content_type == "text"]
         if len(textual_documents) < len(all_documents):
             logger.warning(
-                f"Some documents in {index} index are non-textual."
-                f" They will be written to the index, but the corresponding BM25 representations will not be generated."
+                "Some documents in %s index are non-textual."
+                " They will be written to the index, but the corresponding BM25 representations will not be generated.",
+                index,
             )

         tokenized_corpus = [
@@ -236,10 +238,11 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         duplicate_ids: list = [label.id for label in self._get_duplicate_labels(label_objects, index=index)]
         if len(duplicate_ids) > 0:
             logger.warning(
-                f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
-                f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
-                f" the answer annotation and not the question."
-                f" Problematic ids: {','.join(duplicate_ids)}"
+                "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
+                " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
+                " the answer annotation and not the question."
+                " Problematic ids: %s",
+                ",".join(duplicate_ids),
             )

         for label in label_objects:
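One limitation worth noting for hunks like this one: only the final string formatting becomes lazy. Argument expressions such as ",".join(duplicate_ids) are still evaluated eagerly at the call site. If such an argument were ever expensive, the standard-library idiom is an explicit level guard; a hedged sketch, not part of this commit:

import logging

logger = logging.getLogger(__name__)
duplicate_ids = ["1a2b", "3c4d"]  # illustrative values

if logger.isEnabledFor(logging.WARNING):
    # the join only runs when a WARNING record would actually be handled
    logger.warning("Problematic ids: %s", ",".join(duplicate_ids))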
@@ -497,8 +497,10 @@ class MilvusDocumentStore(SQLDocumentStore):
         """
         if index == self.index:
             logger.warning(
-                f"Deletion of default index '{index}' detected. "
-                f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+                "Deletion of default index '%s' detected. "
+                "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+                index,
+                self.__class__.__name__,
             )
         self._delete_index(index)

@@ -527,9 +527,10 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
         if not any(indices):
             # We don't want to raise here as creating a query-only document store before the index being created asynchronously is a valid use case.
             logger.warning(
-                f"Before you can use an index, you must create it first. The index '{index_name}' doesn't exist. "
-                f"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
-                f"Note that this instance doesn't validate the index after you created it."
+                "Before you can use an index, you must create it first. The index '%s' doesn't exist. "
+                "You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
+                "Note that this instance doesn't validate the index after you created it.",
+                index_name,
             )

         # If the index name is an alias that groups multiple existing indices, each of them must have an embedding_field.
@@ -583,11 +584,11 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
         if self.index_type == "hnsw" and ef_search != 20:
             body = {"knn.algo_param.ef_search": 20}
             self.client.indices.put_settings(index=index_id, body=body, headers=headers)
-            logger.info(f"Set ef_search to 20 for hnsw index '{index_id}'.")
+            logger.info("Set ef_search to 20 for hnsw index '%s'.", index_id)
         elif self.index_type == "flat" and ef_search != 512:
             body = {"knn.algo_param.ef_search": 512}
             self.client.indices.put_settings(index=index_id, body=body, headers=headers)
-            logger.info(f"Set ef_search to 512 for hnsw index '{index_id}'.")
+            logger.info("Set ef_search to 512 for hnsw index '%s'.", index_id)

     def _validate_approximate_knn_settings(
         self, existing_embedding_field: Dict[str, Any], index_settings: Dict[str, Any], index_id: str
@@ -216,7 +216,10 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
         except Exception as e:
             if hasattr(e, "status_code") and e.status_code == 429: # type: ignore
                 logger.warning(
-                    f"Failed to insert a batch of '{len(documents)}' documents because of a 'Too Many Requeset' response. Splitting the number of documents into two chunks with the same size and retrying in {_timeout} seconds."
+                    "Failed to insert a batch of '%s' documents because of a 'Too Many Requeset' response. "
+                    "Splitting the number of documents into two chunks with the same size and retrying in %s seconds.",
+                    len(documents),
+                    _timeout,
                 )
                 if len(documents) == 1:
                     logger.warning(
@@ -478,10 +481,11 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
         duplicate_ids: list = [label.id for label in self._get_duplicate_labels(label_list, index=index)]
         if len(duplicate_ids) > 0:
             logger.warning(
-                f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
-                f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
-                f" the answer annotation and not the question."
-                f" Problematic ids: {','.join(duplicate_ids)}"
+                "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
+                " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
+                " the answer annotation and not the question."
+                " Problematic ids: %s",
+                ",".join(duplicate_ids),
             )
         labels_to_index = []
         for label in label_list:
@@ -1087,7 +1091,8 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
         if not isinstance(query, str):
             logger.warning(
                 "The query provided seems to be not a string, but an object "
-                f"of type {type(query)}. This can cause the query to fail."
+                "of type %s. This can cause the query to fail.",
+                type(query),
             )
         operator = "AND" if all_terms_must_match else "OR"
         body = {
@@ -1599,8 +1604,10 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
         """
         if index == self.index:
             logger.warning(
-                f"Deletion of default index '{index}' detected. "
-                f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+                "Deletion of default index '%s' detected. "
+                "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+                index,
+                self.__class__.__name__,
             )
         self._delete_index(index)

@@ -448,10 +448,11 @@ class SQLDocumentStore(BaseDocumentStore):
         duplicate_ids: list = [label.id for label in self._get_duplicate_labels(labels, index=index)]
         if len(duplicate_ids) > 0:
             logger.warning(
-                f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
-                f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
-                f" the answer annotation and not the question."
-                f" Problematic ids: {','.join(duplicate_ids)}"
+                "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
+                " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
+                " the answer annotation and not the question."
+                " Problematic ids: %s",
+                ",".join(duplicate_ids),
             )
         # TODO: Use batch_size

@@ -52,8 +52,9 @@ def eval_data_from_json(
        problematic_ids.extend(cur_problematic_ids)
    if len(problematic_ids) > 0:
        logger.warning(
-            f"Could not convert an answer for {len(problematic_ids)} questions.\n"
-            f"There were conversion errors for question ids: {problematic_ids}"
+            "Could not convert an answer for %s questions.\nThere were conversion errors for question ids: %s",
+            len(problematic_ids),
+            problematic_ids,
        )
    return docs, labels

@@ -99,8 +100,10 @@ def eval_data_from_jsonl(
        if len(docs) >= batch_size:
            if len(problematic_ids) > 0:
                logger.warning(
-                    f"Could not convert an answer for {len(problematic_ids)} questions.\n"
-                    f"There were conversion errors for question ids: {problematic_ids}"
+                    "Could not convert an answer for %s questions.\n"
+                    "There were conversion errors for question ids: %s",
+                    len(problematic_ids),
+                    problematic_ids,
                )
            yield docs, labels
            docs = []
@@ -358,7 +358,9 @@ class WeaviateDocumentStore(KeywordDocumentStore):
             generated_uuid = str(uuid.UUID(hashed_id.hexdigest()[::2]))
             if not self.uuid_format_warning_raised:
                 logger.warning(
-                    f"Document id {id} is not in uuid format. Such ids will be replaced by uuids, in this case {generated_uuid}."
+                    "Document id %s is not in uuid format. Such ids will be replaced by uuids, in this case %s.",
+                    id,
+                    generated_uuid,
                 )
                 self.uuid_format_warning_raised = True
             id = generated_uuid
@@ -1507,8 +1509,10 @@ class WeaviateDocumentStore(KeywordDocumentStore):
         """
         if index == self.index:
             logger.warning(
-                f"Deletion of default index '{index}' detected. "
-                f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+                "Deletion of default index '%s' detected. "
+                "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+                index,
+                self.__class__.__name__,
            )
         self._delete_index(index)

@@ -336,7 +336,9 @@ class DataSilo:
            logger.warning("No dev set created. Please adjust the dev_split parameter.")

        logger.info(
-            f"Took {len(dev_dataset)} samples out of train set to create dev set (dev split is roughly {self.processor.dev_split})"
+            "Took %s samples out of train set to create dev set (dev split is roughly %s)",
+            len(dev_dataset),
+            self.processor.dev_split,
        )

    def random_split_ConcatDataset(self, ds: ConcatDataset, lengths: List[int]):
@@ -387,7 +389,7 @@ class DataSilo:
            clipped, ave_len, seq_lens, max_seq_len = self._calc_length_stats_biencoder()
        else:
            logger.warning(
-                f"Could not compute length statistics because 'input_ids' or 'query_input_ids' and 'passage_input_ids' are missing."
+                "Could not compute length statistics because 'input_ids' or 'query_input_ids' and 'passage_input_ids' are missing."
            )
            clipped = -1
            ave_len = -1
@@ -416,11 +418,14 @@ class DataSilo:
        logger.info("Proportion clipped: {}".format(clipped))
        if clipped > 0.5:
            logger.info(
-                f"[Haystack Tip] {round(clipped * 100, 1)}% of your samples got cut down to {max_seq_len} tokens. "
+                "[Haystack Tip] %s%% of your samples got cut down to %s tokens. "
                "Consider increasing max_seq_len "
-                f"(the maximum value allowed with the current model is max_seq_len={self.processor.tokenizer.model_max_length}, "
+                "(the maximum value allowed with the current model is max_seq_len=%s, "
                "if this is not enough consider splitting the document in smaller units or changing the model). "
-                "This will lead to higher memory consumption but is likely to improve your model performance"
+                "This will lead to higher memory consumption but is likely to improve your model performance",
+                round(clipped * 100, 1),
+                max_seq_len,
+                self.processor.tokenizer.model_max_length,
            )
        elif "query_input_ids" in self.tensor_names and "passage_input_ids" in self.tensor_names:
            logger.info(
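The "fix % sign" bullet in the commit message shows up in the hunk above: once a message becomes a %-style format string, a literal percent sign must be escaped as %%, otherwise the logging module would try to read it as a placeholder. A small illustration with made-up values:

import logging

logger = logging.getLogger(__name__)
clipped = 0.63  # illustrative value

# "%s%%" renders the argument followed by a literal percent sign:
logger.info("[Haystack Tip] %s%% of your samples got cut down.", round(clipped * 100, 1))
# -> [Haystack Tip] 63.0% of your samples got cut down.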
@@ -59,9 +59,11 @@ def convert_features_to_dataset(features):
            base = check.ravel()[0]
            if not np.issubdtype(type(base), np.integer):
                logger.warning(
-                    f"Problem during conversion to torch tensors:\n"
-                    f"A non-integer value for feature '{t_name}' with a value of: "
-                    f"'{base}' will be converted to a torch tensor of dtype long."
+                    "Problem during conversion to torch tensors:\n"
+                    "A non-integer value for feature '%s' with a value of: "
+                    "'%s' will be converted to a torch tensor of dtype long.",
+                    t_name,
+                    base,
                )
        except:
            logger.debug(
@@ -38,10 +38,11 @@ def sample_to_features_text(sample, tasks, max_seq_len, tokenizer):

    if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
        logger.error(
-            f"FastTokenizer encoded sample {sample.clear_text['text']} to "
-            f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-            f"from number of tokens produced in tokenize_with_metadata(). \n"
-            f"Further processing is likely to be wrong."
+            "FastTokenizer encoded sample %s to %s tokens, which differs "
+            "from number of tokens produced in tokenize_with_metadata(). \n"
+            "Further processing is likely to be wrong.",
+            sample.clear_text["text"],
+            len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1),
        )
    else:
        # TODO It might be cleaner to adjust the data structure in sample.tokenized
@@ -565,8 +565,9 @@ class SquadProcessor(Processor):
                )
            except Exception as e:
                logger.warning(
-                    f"Could not devide document into passages. Document: {basket.raw['document_text'][:200]}\n"
-                    f"With error: {e}"
+                    "Could not devide document into passages. Document: %s\nWith error: %s",
+                    basket.raw["document_text"][:200],
+                    e,
                )
                passage_spans = []

@@ -663,8 +664,9 @@ class SquadProcessor(Processor):
                # check if answer string can be found in context
                if answer_text not in doc_text:
                    logger.warning(
-                        f"Answer '{answer['text']}' not contained in context.\n"
-                        f"Example will not be converted for training/evaluation."
+                        "Answer '%s' not contained in context.\n"
+                        "Example will not be converted for training/evaluation.",
+                        answer["text"],
                    )
                    error_in_answer = True
                    label_idxs[i][0] = -100 # TODO remove this hack also from featurization
@@ -672,8 +674,10 @@ class SquadProcessor(Processor):
                    break # Break loop around answers, so the error message is not shown multiple times
                if answer_indices.strip() != answer_text.strip():
                    logger.warning(
-                        f"Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n"
-                        f"Example will not be converted for training/evaluation."
+                        "Answer using start/end indices is '%s' while gold label text is '%s'.\n"
+                        "Example will not be converted for training/evaluation.",
+                        answer_indices,
+                        answer_text,
                    )
                    error_in_answer = True
                    label_idxs[i][0] = -100 # TODO remove this hack also from featurization
@@ -1025,7 +1029,7 @@ class TextSimilarityProcessor(Processor):

        if problematic_ids:
            logger.error(
-                f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
+                "There were %s errors during preprocessing at positions: %s", len(problematic_ids), problematic_ids
            )

        if return_baskets:
@@ -1104,7 +1108,7 @@ class TextSimilarityProcessor(Processor):

        if len(tokenized_query) == 0:
            logger.warning(
-                f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
+                "The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
            )
            return None

@@ -1222,7 +1226,8 @@ class TextSimilarityProcessor(Processor):
                if title is None:
                    title = ""
                    logger.warning(
-                        f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' "
+                        "Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '%s' ",
+                        ctx,
                    )
                res.append(tuple((title, ctx)))
        return res
@@ -1545,7 +1550,7 @@ class TableTextSimilarityProcessor(Processor):

        if problematic_ids:
            logger.error(
-                f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
+                "There were %s errors during preprocessing at positions: %s", len(problematic_ids), problematic_ids
            )

        if return_baskets:
@@ -1588,7 +1593,7 @@ class TableTextSimilarityProcessor(Processor):

        if len(tokenized_query) == 0:
            logger.warning(
-                f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
+                "The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
            )
            return None

@@ -125,7 +125,8 @@ class Evaluator:
                temperature_change = (abs(temperature_current - temperature_previous) / temperature_previous) * 100.0
                if temperature_change > 50:
                    logger.warning(
-                        f"temperature used for calibration of confidence scores changed by more than {temperature_change} percent"
+                        "temperature used for calibration of confidence scores changed by more than %s percent",
+                        temperature_change,
                    )
            if hasattr(head, "aggregate_preds"):
                # Needed to convert NQ ids from np arrays to strings
@@ -146,8 +147,11 @@ class Evaluator:
                result["report"] = compute_report_metrics(head, preds_all[head_num], label_all[head_num])
            except:
                logger.error(
-                    f"Couldn't create eval report for head {head_num} with following preds and labels:"
-                    f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}"
+                    "Couldn't create eval report for head %s with following preds and labels:"
+                    "\n Preds: %s \n Labels: %s",
+                    head_num,
+                    preds_all[head_num],
+                    label_all[head_num],
                )
                result["report"] = "Error"

@@ -77,8 +77,9 @@ class Inferencer:
        self.devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
            )

        self.processor = processor
@@ -187,9 +188,7 @@ class Inferencer:

        devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
        if len(devices) > 1:
-            logger.warning(
-                f"Multiple devices are not supported in Inferencer, " f"using the first device {devices[0]}."
-            )
+            logger.warning("Multiple devices are not supported in Inferencer, using the first device %s.", devices[0])

        name = os.path.basename(model_name_or_path)

@@ -390,8 +390,9 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
        for prediction_head in self.prediction_heads:
            if len(prediction_head.layer_dims) != 2:
                logger.error(
-                    f"Currently conversion only works for PredictionHeads that are a single layer Feed Forward NN with dimensions [LM_output_dim, number_classes].\n"
-                    f" Your PredictionHead has {str(prediction_head.layer_dims)} dimensions."
+                    "Currently conversion only works for PredictionHeads that are a single layer Feed Forward NN with dimensions [LM_output_dim, number_classes].\n"
+                    " Your PredictionHead has %s dimensions.",
+                    str(prediction_head.layer_dims),
                )
                continue
            if prediction_head.model_type == "span_classification":
@@ -399,8 +400,8 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
                converted_models.append(transformers_model)
            else:
                logger.error(
-                    f"Haystack -> Transformers conversion is not supported yet for"
-                    f" prediction heads of type {prediction_head.model_type}"
+                    "Haystack -> Transformers conversion is not supported yet for prediction heads of type %s",
+                    prediction_head.model_type,
                )

        return converted_models
@@ -93,7 +93,7 @@ class FeatureExtractor:
            with open(config_file) as f:
                config = json.load(f)
            feature_extractor_classname = config["tokenizer_class"]
-            logger.debug(f"⛏️ Selected feature extractor: {feature_extractor_classname} (from {config_file})")
+            logger.debug("⛏️ Selected feature extractor: %s (from %s)", feature_extractor_classname, config_file)
            # Use FastTokenizers as much as possible
            try:
                feature_extractor_class = getattr(transformers, feature_extractor_classname + "Fast")
@@ -122,7 +122,7 @@ class FeatureExtractor:
                    f"\n- {f'{chr(10)}- '.join(FEATURE_EXTRACTORS.keys())}"
                ) from e
            logger.debug(
-                f"⛏️ Selected feature extractor: {feature_extractor_class.__name__} (for model type '{model_type}')"
+                "⛏️ Selected feature extractor: %s (for model type '%s')", feature_extractor_class.__name__, model_type
            )

        self.default_params = DEFAULT_EXTRACTION_PARAMS.get(feature_extractor_class, {})
@@ -293,7 +293,7 @@ class HFLanguageModel(LanguageModel):
            model_emb_size = self.model.resize_token_embeddings(new_num_tokens=None).num_embeddings
            vocab_size = model_emb_size + n_added_tokens
            logger.info(
-                f"Resizing embedding layer of LM from {model_emb_size} to {vocab_size} to cope with custom vocab."
+                "Resizing embedding layer of LM from %s to %s to cope with custom vocab.", model_emb_size, vocab_size
            )
            self.model.resize_token_embeddings(vocab_size)
            # verify
@@ -464,7 +464,7 @@ class HFLanguageModelNoSegmentIds(HFLanguageModelWithPooler):
        specified using the arguments `output_hidden_states` and `output_attentions`.
        """
        if segment_ids is not None:
-            logger.warning(f"'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)
+            logger.warning("'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)

        return super().forward(
            input_ids=input_ids,
@@ -636,8 +636,9 @@ class DPREncoder(LanguageModel):
        """
        if model_config.model_type.lower() != "bert":
            logger.warning(
-                f"Using a model of type '{model_config.model_type}' which might be incompatible with DPR encoders. "
-                f"Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors."
+                "Using a model of type '%s' which might be incompatible with DPR encoders. "
+                "Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors.",
+                model_config.model_type,
            )
        config_dict = vars(model_config)
        if model_kwargs:
@@ -876,12 +877,13 @@ def get_language_model(

    if not model_type:
        logger.error(
-            f"Model type not understood for '{pretrained_model_name_or_path}' "
-            f"({model_type if model_type else 'model_type not set'}). "
+            "Model type not understood for '%s' (%s). "
            "Either supply the local path for a saved model, "
            "or the name of a model that can be downloaded from the Model Hub. "
            "Ensure that the model class name can be inferred from the directory name "
-            "when loading a Transformers model."
+            "when loading a Transformers model.",
+            pretrained_model_name_or_path,
+            model_type if model_type else "model_type not set",
        )
        logger.error("Using the AutoModel class for '%s'. This can cause crashes!", pretrained_model_name_or_path)
        model_type = "Auto"
@@ -957,7 +959,7 @@ def _get_model_type(

    if model_type and model_type.lower() == "roberta" and "mlm" in model_name_or_path.lower():
        logger.error(
-            f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later."
+            "MLM part of codebert is currently not supported in Haystack: '%s' may crash later.", model_name_or_path
        )

    return model_type
@@ -88,13 +88,14 @@ def get_model(
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, **autoconfig_kwargs)
            model_type = config.model_type
        except Exception as e:
-            logger.debug(f"Can't find model type for {pretrained_model_name_or_path}: {e}")
+            logger.debug("Can't find model type for %s: %s", pretrained_model_name_or_path, e)

        if feature_extractor_kwargs is not None:
            logger.debug(
                "Can't forward feature_extractor_kwargs to a SentenceTransformers model. "
                "These kwargs are being dropped. "
-                f"Content of feature_extractor_kwargs: {feature_extractor_kwargs}"
+                "Content of feature_extractor_kwargs: %s",
+                feature_extractor_kwargs,
            )

    else:
@@ -102,9 +103,10 @@ def get_model(
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, **autoconfig_kwargs)
        if not config.model_type:
            logger.error(
-                f"Model type not understood for '{pretrained_model_name_or_path}'. Please provide the name of "
+                "Model type not understood for '%s'. Please provide the name of "
                "a model that can be downloaded from the Model Hub.\nUsing the AutoModel class. "
-                "THIS CAN CAUSE CRASHES and won't work for models that are not working with text."
+                "THIS CAN CAUSE CRASHES and won't work for models that are not working with text.",
+                pretrained_model_name_or_path,
            )
            model_type = None
        else:
@@ -112,10 +114,13 @@ def get_model(
                model_type = HUGGINGFACE_CAPITALIZE[config.model_type.lower()]
            except KeyError as e:
                logger.error(
-                    f"Haystack doesn't support model '{pretrained_model_name_or_path}' (type '{config.model_type.lower()}') "
+                    "Haystack doesn't support model '%s' (type '%s') "
                    "We'll use the AutoModel class for it. "
                    "THIS CAN CAUSE CRASHES and won't work for models that are not working with text. "
-                    f"Supported model types: {', '.join(HUGGINGFACE_CAPITALIZE.keys())}"
+                    "Supported model types: %s",
+                    pretrained_model_name_or_path,
+                    config.model_type.lower(),
+                    ", ".join(HUGGINGFACE_CAPITALIZE.keys()),
                )
                model_type = None

@@ -25,9 +25,11 @@ class HaystackModel(ABC):
            See the values of `haystack.schema.ContentTypes`.
        """
        logger.info(
-            f" 🤖 Loading '{pretrained_model_name_or_path}' "
-            f"({self.__class__.__name__} of type '{model_type if model_type else '<unknown>'}' "
-            f"for {content_type} data)"
+            " 🤖 Loading '%s' (%s of type '%s' for %s data)",
+            pretrained_model_name_or_path,
+            self.__class__.__name__,
+            model_type if model_type else "<unknown>",
+            content_type,
        )
        self.model_name_or_path = pretrained_model_name_or_path
        self.model_type = model_type
@@ -164,8 +164,11 @@ class PredictionHead(nn.Module):
            return
        new_dims = [input_dim] + old_dims[1:]
        logger.info(
-            f"Resizing input dimensions of {type(self).__name__} ({self.task_name}) "
-            f"from {old_dims} to {new_dims} to match language model"
+            "Resizing input dimensions of %s (%s) from %s to %s to match language model",
+            type(self).__name__,
+            self.task_name,
+            old_dims,
+            new_dims,
        )
        self.feed_forward = FeedForwardBlock(new_dims)
        self.layer_dims[0] = input_dim
@@ -260,8 +263,8 @@ class QuestionAnsweringHead(PredictionHead):
        super(QuestionAnsweringHead, self).__init__()
        if len(kwargs) > 0:
            logger.warning(
-                f"Some unused parameters are passed to the QuestionAnsweringHead. "
-                f"Might not be a problem. Params: {json.dumps(kwargs)}"
+                "Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: %s",
+                json.dumps(kwargs),
            )
        self.layer_dims = layer_dims
        assert self.layer_dims[-1] == 2
@@ -105,20 +105,23 @@ class QACandidate:
            self.answer = "no_answer"
            if self.offset_answer_start != 0 or self.offset_answer_end != 0:
                logger.error(
-                    f"Both start and end offsets should be 0: \n"
-                    f"{self.offset_answer_start}, {self.offset_answer_end} with a no_answer. "
+                    "Both start and end offsets should be 0: \n%s, %s with a no_answer. ",
+                    self.offset_answer_start,
+                    self.offset_answer_end,
                )
        else:
            self.answer = string
            if self.offset_answer_end - self.offset_answer_start <= 0:
                logger.error(
-                    f"End offset comes before start offset: \n"
-                    f"({self.offset_answer_start}, {self.offset_answer_end}) with a span answer. "
+                    "End offset comes before start offset: \n(%s, %s) with a span answer. ",
+                    self.offset_answer_start,
+                    self.offset_answer_end,
                )
            elif self.offset_answer_end <= 0:
                logger.error(
-                    f"Invalid end offset: \n"
-                    f"({self.offset_answer_start}, {self.offset_answer_end}) with a span answer. "
+                    "Invalid end offset: \n(%s, %s) with a span answer. ",
+                    self.offset_answer_start,
+                    self.offset_answer_end,
                )

    def _create_context_window(self, context_window_size: int, clear_text: str) -> Tuple[str, int, int]:
@@ -167,7 +170,8 @@ class QACandidate:
        """
        if self.offset_unit != "token":
            logger.error(
-                f"QACandidate needs to have self.offset_unit=token before calling _span_to_string() (id = {self.passage_id})"
+                "QACandidate needs to have self.offset_unit=token before calling _span_to_string() (id = %s)",
+                self.passage_id,
            )

        start_t = self.offset_answer_start
@@ -104,8 +104,10 @@ class Trainer:
            if use_amp in amp_mapping:
                logger.warning(
                    "The Trainer only supports native PyTorch automatic mixed precision and no longer supports the Apex library.\n"
-                    f"Because you provided Apex optimization level {use_amp}, automatic mixed precision was set to {amp_mapping[use_amp]}.\n"
-                    "In the future, set `use_amp=True` to turn on automatic mixed precision."
+                    "Because you provided Apex optimization level %s, automatic mixed precision was set to %s.\n"
+                    "In the future, set `use_amp=True` to turn on automatic mixed precision.",
+                    use_amp,
+                    amp_mapping[use_amp],
                )
                use_amp = amp_mapping[use_amp]
            else:
@@ -570,8 +572,11 @@ class Trainer:
        if ranks_with_data < torch.distributed.get_world_size():
            if step is not None:
                logger.info(
-                    f"Stopping epoch {self.from_epoch} at step {step} for rank {self.local_rank} since at least one other rank "
-                    f"(~ one GPU) in distributed training doesn't have any more batches... "
+                    "Stopping epoch %s at step %s for rank %s since at least one other rank "
+                    "(~ one GPU) in distributed training doesn't have any more batches... ",
+                    self.from_epoch,
+                    step,
+                    self.local_rank,
                )
            return False
        else:
@@ -191,13 +191,15 @@ class OpenAIAnswerGenerator(BaseGenerator):

        if len(input_docs) == 0:
            logger.warning(
-                f"Skipping all of the provided Documents, as none of them fits the maximum token limit of "
-                f"{self.MAX_TOKENS_LIMIT}. The generated answers will therefore not be conditioned on any context."
+                "Skipping all of the provided Documents, as none of them fits the maximum token limit of %s"
+                "The generated answers will therefore not be conditioned on any context.",
+                self.MAX_TOKENS_LIMIT,
            )
        elif skipped_docs >= 1:
            logger.warning(
-                f"Skipping {skipped_docs} of the provided Documents, as using them would exceed the maximum token "
-                f"limit of {self.MAX_TOKENS_LIMIT}."
+                "Skipping %s of the provided Documents, as using them would exceed the maximum token limit of %s.",
+                skipped_docs,
+                self.MAX_TOKENS_LIMIT,
            )

        # Top ranked documents should go at the end
@@ -131,8 +131,9 @@ class RAGenerator(BaseGenerator):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
            )

        self.tokenizer = RagTokenizer.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
@@ -389,8 +390,9 @@ class Seq2SeqGenerator(BaseGenerator):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
            )

        Seq2SeqGenerator._register_converters(model_name_or_path, input_converter)
@@ -52,8 +52,9 @@ class TextToSpeech:
        resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(resolved_devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {resolved_devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                resolved_devices[0],
            )

        self.model = _Text2SpeechModel.from_pretrained(
@@ -311,7 +311,12 @@ class Crawler(BaseComponent):
                json.dump(document.to_dict(), f)
        except Exception as e:
            logging.exception(
-                f"Crawler can't save the content of '{link}' under '{file_path}'. This webpage will be skipped, but links from this page will still be crawled. Make sure the path above is accessible and the file name is valid. If the file name is invalid, consider setting 'crawler_naming_function' to another function."
+                "Crawler can't save the content of '%s' under '%s'. "
+                "This webpage will be skipped, but links from this page will still be crawled. "
+                "Make sure the path above is accessible and the file name is valid. "
+                "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
+                link,
+                file_path,
            )

        paths.append(file_path)
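The Crawler hunk above uses logging.exception() rather than logger.error(): called from an except block, it logs at ERROR level and appends the current traceback automatically, while the message placeholders follow the same lazy %-style rules as every other call in this commit. A minimal sketch with illustrative values:

import logging

try:
    raise OSError("disk full")  # stand-in for a failed file write
except Exception:
    # the traceback is attached automatically; '%s' args stay lazy
    logging.exception("Crawler can't save the content of '%s' under '%s'.", "https://example.com", "/tmp/page.json")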
@@ -123,15 +123,17 @@ class TransformersDocumentClassifier(BaseDocumentClassifier):

        if labels and task == "text-classification":
            logger.warning(
-                f"Provided labels {labels} will be ignored for task text-classification. Set task to "
-                f"zero-shot-classification to use labels."
+                "Provided labels %s will be ignored for task text-classification. Set task to "
+                "zero-shot-classification to use labels.",
+                labels,
            )

        resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(resolved_devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {resolved_devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                resolved_devices[0],
            )

        if tokenizer is None:
@@ -75,16 +75,18 @@ class EvalDocuments(BaseComponent):
            self.top_k_used = top_k
        elif self.top_k_used != top_k:
            logger.warning(
-                f"EvalDocuments was last run with top_k_eval_documents={self.top_k_used} but is "
-                f"being run again with top_k={self.top_k}. "
-                f"The evaluation counter is being reset from this point so that the evaluation "
-                f"metrics are interpretable."
+                "EvalDocuments was last run with top_k_eval_documents=%s} but is "
+                "being run again with top_k=%s. "
+                "The evaluation counter is being reset from this point so that the evaluation "
+                "metrics are interpretable.",
+                self.top_k_used,
+                self.top_k,
            )
            self.init_counts()

        if len(documents) < top_k and not self.too_few_docs_warning:
            logger.warning(
-                f"EvalDocuments is being provided less candidate documents than top_k " f"(currently set to {top_k})."
+                "EvalDocuments is being provided less candidate documents than top_k (currently set to %s).", top_k
            )
            self.too_few_docs_warning = True

@@ -123,8 +123,9 @@ class EntityExtractor(BaseComponent):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
            )
        self.batch_size = batch_size
        self.progress_bar = progress_bar
@@ -63,8 +63,9 @@ class FileTypeClassifier(BaseComponent):
            return mimetypes.guess_extension(extension) or ""
        except NameError as ne:
            logger.error(
-                f"The type of '{file_path}' could not be guessed, probably because 'python-magic' is not installed. Ignoring this error."
-                "Please make sure the necessary OS libraries are installed if you need this functionality ('python-magic' or 'python-magic-bin' on Windows)."
+                "The type of '%s' could not be guessed, probably because 'python-magic' is not installed. Ignoring this error."
+                "Please make sure the necessary OS libraries are installed if you need this functionality ('python-magic' or 'python-magic-bin' on Windows).",
+                file_path,
            )
            return ""

@@ -201,8 +201,10 @@ class AzureConverter(BaseConverter):
                    file_text += f" {cell}"
        if not self.validate_language(file_text, valid_languages):
            logger.warning(
-                f"The language for {file_path} is not one of {valid_languages}. The file may not have "
-                f"been decoded in the correct text format."
+                "The language for %s is not one of %s. The file may not have "
+                "been decoded in the correct text format.",
+                file_path,
+                valid_languages,
            )

        return docs
@@ -146,8 +146,9 @@ class ImageToTextConverter(BaseConverter):
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
-                    f"The language for image is not one of {valid_languages}. The file may not have "
-                    f"been decoded in the correct text format."
+                    "The language for image is not one of %s. The file may not have "
+                    "been decoded in the correct text format.",
+                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
@@ -200,8 +200,10 @@ class ParsrConverter(BaseConverter):
                    file_text += f" {cell}"
        if not self.validate_language(file_text, valid_languages):
            logger.warning(
-                f"The language for {file_path} is not one of {valid_languages}. The file may not have "
-                f"been decoded in the correct text format."
+                "The language for %s is not one of %s. The file may not have "
+                "been decoded in the correct text format.",
+                file_path,
+                valid_languages,
            )

        if extract_headlines:
@@ -150,8 +150,10 @@ class PDFToTextConverter(BaseConverter):
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
-                    f"The language for {file_path} is not one of {valid_languages}. The file may not have "
-                    f"been decoded in the correct text format."
+                    "The language for %s is not one of %s. The file may not have "
+                    "been decoded in the correct text format.",
+                    file_path,
+                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
@@ -172,8 +172,10 @@ class TikaConverter(BaseConverter):
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
-                    f"The language for {file_path} is not one of {valid_languages}. The file may not have "
-                    f"been decoded in the correct text format."
+                    "The language for %s is not one of %s. The file may not have "
+                    "been decoded in the correct text format.",
+                    file_path,
+                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
@@ -75,8 +75,10 @@ class TextConverter(BaseConverter):
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
-                    f"The language for {file_path} is not one of {valid_languages}. The file may not have "
-                    f"been decoded in the correct text format."
+                    "The language for %s is not one of %s. The file may not have "
+                    "been decoded in the correct text format.",
+                    file_path,
+                    valid_languages,
                )

        text = "".join(cleaned_pages)
@@ -119,8 +119,9 @@ class PseudoLabelGenerator(BaseComponent):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
            )

        self.retriever = retriever
@@ -106,7 +106,7 @@ class PreProcessor(BasePreProcessor):
        try:
            nltk.download("punkt")
        except FileExistsError as error:
-            logger.debug(f"NLTK punkt tokenizer seems to be already downloaded. Error message: {error}")
+            logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
            pass
        self.clean_whitespace = clean_whitespace
        self.clean_header_footer = clean_header_footer
@@ -747,14 +747,16 @@ class PreProcessor(BasePreProcessor):
            # NLTK failed to load custom SentenceTokenizer, fallback to the default model or to English
            if language_name is not None:
                logger.error(
-                    f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. "
-                    f"Using default {self.language} model."
+                    "PreProcessor couldn't find custom sentence tokenizer model for %s. Using default %s model.",
+                    self.language,
+                    self.language,
                )
                sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
            else:
                logger.error(
-                    f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. "
-                    f"Using English instead."
+                    "PreProcessor couldn't find default or custom sentence tokenizer model for %s. "
+                    "Using English instead.",
+                    self.language,
                )
                sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")

@@ -763,8 +765,9 @@ class PreProcessor(BasePreProcessor):
                sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
            else:
                logger.error(
-                    f"PreProcessor couldn't find the default sentence tokenizer model for {self.language}. "
-                    f" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter."
+                    "PreProcessor couldn't find the default sentence tokenizer model for %s. "
+                    " Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
+                    self.language,
                )
                sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")

@@ -125,8 +125,11 @@ class PromptTemplate(BasePromptTemplate, ABC):
        if args:
            if len(args) != len(self.prompt_params):
                logger.warning(
-                    f"For {self.name}, expected {self.prompt_params} arguments, instead "
-                    f"got {len(args)} arguments {args}"
+                    "For %s, expected %s arguments, instead got %s arguments %s",
+                    self.name,
+                    self.prompt_params,
+                    len(args),
+                    args,
                )
            for prompt_param, arg in zip(self.prompt_params, args):
                template_dict[prompt_param] = [arg] if isinstance(arg, str) else arg
@@ -229,8 +232,9 @@ class HFLocalInvocationLayer(PromptModelInvocationLayer):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
            )

        # Due to reflective construction of all invocation layers we might receive some
@@ -100,8 +100,9 @@ class TransformersQueryClassifier(BaseQueryClassifier):
        resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(resolved_devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {resolved_devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                resolved_devices[0],
            )

        self.model = pipeline(
@@ -81,8 +81,9 @@ class QuestionGenerator(BaseComponent):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
-                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-                f"using the first device {self.devices[0]}."
+                "Multiple devices are not supported in %s inference, using the first device %s.",
+                self.__class__.__name__,
+                self.devices[0],
            )
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name_or_path, revision=model_version, use_auth_token=use_auth_token
@ -1023,9 +1023,10 @@ class FARMReader(BaseReader):
|
||||
|
||||
if self.top_k_per_candidate != 4:
|
||||
logger.info(
|
||||
f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n"
|
||||
f"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
|
||||
f"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5"
|
||||
"Performing Evaluation using top_k_per_candidate = %s \n"
|
||||
"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
|
||||
"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5",
|
||||
self.top_k_per_candidate,
|
||||
)
|
||||
|
||||
# extract all questions for evaluation
|
||||
@ -1062,7 +1063,7 @@ class FARMReader(BaseReader):
|
||||
continue
|
||||
if label.answer.offsets_in_document is None:
|
||||
logger.error(
|
||||
f"Label.answer.offsets_in_document was None, but Span object was expected: {label} "
|
||||
"Label.answer.offsets_in_document was None, but Span object was expected: %s ", label
|
||||
)
|
||||
continue
|
||||
# add to existing answers
|
||||
@ -1074,7 +1075,11 @@ class FARMReader(BaseReader):
|
||||
# Hack to fix problem where duplicate questions are merged by doc_store processing creating a QA example with 8 annotations > 6 annotation max
|
||||
if len(aggregated_per_question[aggregation_key]["answers"]) >= 6:
|
||||
logger.warning(
|
||||
f"Answers in this sample are being dropped because it has more than 6 answers. (doc_id: {doc_id}, question: {label.query}, label_id: {label.id})"
|
||||
"Answers in this sample are being dropped because it has more than 6 answers. "
|
||||
"(doc_id: %s, question: %s, label_id: %s)",
|
||||
doc_id,
|
||||
label.query,
|
||||
label.id,
|
||||
)
|
||||
continue
|
||||
aggregated_per_question[aggregation_key]["answers"].append(
|
||||
|
@ -116,8 +116,9 @@ class TableReader(BaseReader):
|
||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||
if len(self.devices) > 1:
|
||||
logger.warning(
|
||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
||||
f"using the first device {self.devices[0]}."
|
||||
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||
self.__class__.__name__,
|
||||
self.devices[0],
|
||||
)
|
||||
|
||||
config = TapasConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
|
||||
@ -646,8 +647,9 @@ class RCIReader(BaseReader):
        self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
                f"using the first device {self.devices[0]}."
                "Multiple devices are not supported in %s inference, using the first device %s.",
                self.__class__.__name__,
                self.devices[0],
            )

        self.row_model = AutoModelForSequenceClassification.from_pretrained(
@ -86,8 +86,9 @@ class TransformersReader(BaseReader):

        if len(self.devices) > 1:
            logger.warning(
                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
                f"using the first device {self.devices[0]}."
                "Multiple devices are not supported in %s inference, using the first device %s.",
                self.__class__.__name__,
                self.devices[0],
            )

        self.model = pipeline(
@ -104,14 +104,18 @@ class _BaseEmbeddingEncoder:

        if model_similarity is not None and document_store.similarity != model_similarity:
            logger.warning(
                f"You seem to be using {model_name} model with the {document_store.similarity} function instead of the recommended {model_similarity}. "
                f"This can be set when initializing the DocumentStore"
                "You seem to be using %s model with the %s function instead of the recommended %s. "
                "This can be set when initializing the DocumentStore",
                model_name,
                document_store.similarity,
                model_similarity,
            )
        elif "dpr" in model_name.lower() and document_store.similarity != "dot_product":
            logger.warning(
                f"You seem to be using a DPR model with the {document_store.similarity} function. "
                f"We recommend using dot_product instead. "
                f"This can be set when initializing the DocumentStore"
                "You seem to be using a DPR model with the %s function. "
                "We recommend using dot_product instead. "
                "This can be set when initializing the DocumentStore",
                document_store.similarity,
            )

@ -245,9 +245,8 @@ class BaseRetriever(BaseComponent):
        mean_avg_precision = summed_avg_precision / number_of_questions

        logger.info(
            (
                f"For {correct_retrievals} out of {number_of_questions} questions ({recall:.2%}), the answer was in"
                f" the top-{top_k} candidate passages selected by the retriever."
            "For {} out of {} questions ({:.2%}), the answer was in the top-{} candidate passages selected by the retriever.".format(
                correct_retrievals, number_of_questions, recall, top_k
            )
        )
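Note that this hunk keeps str.format rather than switching to %-style arguments, presumably because the {:.2%} conversion has no direct printf-style counterpart: %-formatting would need the fraction scaled by hand and the percent sign escaped. A sketch of the trade-off (the value is made up):

recall = 0.8732

print("{:.2%}".format(recall))    # prints: 87.32%
print("%.2f%%" % (recall * 100))  # prints: 87.32%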
@ -178,9 +178,10 @@ class DensePassageRetriever(DenseRetriever):

        if document_store and document_store.similarity != "dot_product":
            logger.warning(
                f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                "You are using a Dense Passage Retriever model with the %s function. "
                "We recommend you use dot_product instead. "
                "This can be set when initializing the DocumentStore"
                "This can be set when initializing the DocumentStore",
                document_store.similarity,
            )

        # Init & Load Encoders
@ -550,8 +551,9 @@ class DensePassageRetriever(DenseRetriever):
        """
        if self.processor.num_hard_negatives != 0:
            logger.warning(
                f"'num_hard_negatives' is set to {self.processor.num_hard_negatives}, but inference does "
                f"not require any hard negatives. Setting num_hard_negatives to 0."
                "'num_hard_negatives' is set to %s, but inference does "
                "not require any hard negatives. Setting num_hard_negatives to 0.",
                self.processor.num_hard_negatives,
            )
            self.processor.num_hard_negatives = 0
@ -1163,8 +1165,9 @@ class TableTextRetriever(DenseRetriever):

        if self.processor.num_hard_negatives != 0:
            logger.warning(
                f"'num_hard_negatives' is set to {self.processor.num_hard_negatives}, but inference does "
                f"not require any hard negatives. Setting num_hard_negatives to 0."
                "'num_hard_negatives' is set to %s, but inference does "
                "not require any hard negatives. Setting num_hard_negatives to 0.",
                self.processor.num_hard_negatives,
            )
            self.processor.num_hard_negatives = 0
@ -1532,10 +1535,11 @@ class EmbeddingRetriever(DenseRetriever):
            and model_format != "sentence_transformers"
        ):
            logger.warning(
                f"You seem to be using a Sentence Transformer embedding model but 'model_format' is set to '{self.model_format}'."
                f" You may need to set model_format='sentence_transformers' to ensure correct loading of model."
                f"As an alternative, you can let Haystack derive the format automatically by not setting the "
                f"'model_format' parameter at all."
                "You seem to be using a Sentence Transformer embedding model but 'model_format' is set to '%s'."
                " You may need to set model_format='sentence_transformers' to ensure correct loading of model."
                " As an alternative, you can let Haystack derive the format automatically by not setting the "
                "'model_format' parameter at all.",
                self.model_format,
            )

        self.embedding_encoder = _EMBEDDING_ENCODERS[self.model_format](retriever=self)
@ -96,8 +96,9 @@ class TransformersSummarizer(BaseSummarizer):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
                f"using the first device {self.devices[0]}."
                "Multiple devices are not supported in %s inference, using the first device %s.",
                self.__class__.__name__,
                self.devices[0],
            )

        if tokenizer is None:
@ -83,8 +83,9 @@ class TransformersTranslator(BaseTranslator):
        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
        if len(self.devices) > 1:
            logger.warning(
                f"Multiple devices are not supported in {self.__class__.__name__} inference, "
                f"using the first device {self.devices[0]}."
                "Multiple devices are not supported in %s inference, using the first device %s.",
                self.__class__.__name__,
                self.devices[0],
            )

        self.max_seq_len = max_seq_len
@ -293,10 +293,13 @@ class Pipeline:
        for document_store in document_stores:
            if document_store["type"] != "DeepsetCloudDocumentStore":
                logger.info(
                    f"In order to be used on Deepset Cloud, component '{document_store['name']}' of type '{document_store['type']}' "
                    f"has been automatically converted to type DeepsetCloudDocumentStore. "
                    f"Usually this replacement will result in equivalent pipeline quality. "
                    f"However depending on chosen settings of '{document_store['name']}' differences might occur."
                    "In order to be used on Deepset Cloud, component '%s' of type '%s' "
                    "has been automatically converted to type DeepsetCloudDocumentStore. "
                    "Usually this replacement will result in equivalent pipeline quality. "
                    "However depending on chosen settings of '%s' differences might occur.",
                    document_store["name"],
                    document_store["type"],
                    document_store["name"],
                )
                document_store["type"] = "DeepsetCloudDocumentStore"
                document_store["params"] = {}
@ -784,7 +787,7 @@ class Pipeline:

        # crop dataset if `dataset_size` is provided and is valid
        if num_documents is not None and 0 < num_documents < len(corpus):
            logger.info(f"Cropping dataset from {len(corpus)} to {num_documents} documents")
            logger.info("Cropping dataset from %s to %s documents", len(corpus), num_documents)
            corpus = dict(itertools.islice(corpus.items(), num_documents))
            # Remove queries that don't contain the remaining documents
            corpus_ids = set(list(corpus.keys()))
@ -800,8 +803,9 @@ class Pipeline:
            qrels = qrels_new
        elif num_documents is not None and (num_documents < 1 or num_documents > len(corpus)):
            logging.warning(
                f"'num_documents' variable should be lower than corpus length and have a positive value, but it's {num_documents}."
                " Dataset size remains unchanged."
                "'num_documents' variable should be lower than corpus length and have a positive value, but it's %s."
                " Dataset size remains unchanged.",
                num_documents,
            )

        # check index before eval
@ -82,7 +82,11 @@ def get_component_definitions(
            param_name = key.replace(env_prefix, "").lower()
            component_definition["params"][param_name] = value
            logger.info(
                f"Param '{param_name}' of component '{name}' overwritten with environment variable '{key}' value '{value}'."
                "Param '%s' of component '%s' overwritten with environment variable '%s' value '%s'.",
                param_name,
                name,
                key,
                value,
            )
    return component_definitions
@ -291,11 +295,13 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
    ok_to_ignore_version = pipeline_version == "ignore" and "rc" in __version__
    if not ok_to_ignore_version:
        logging.warning(
            f"This pipeline is version '{pipeline_version}', but you're using Haystack {__version__}\n"
            "This pipeline is version '%s', but you're using Haystack %s\n"
            "This might cause bugs and unexpected behaviors."
            "Please check out the release notes (https://github.com/deepset-ai/haystack/releases/latest), "
            "the documentation (https://haystack.deepset.ai/components/pipelines#yaml-file-definitions) "
            "and fix your configuration accordingly."
            "and fix your configuration accordingly.",
            pipeline_version,
            __version__,
        )

    # Load the json schema, and create one if it doesn't exist yet
@ -317,7 +323,8 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
            if validation.instance["type"] not in loaded_custom_nodes:

                logger.info(
                    f"Missing definition for node of type {validation.instance['type']}. Looking into local classes..."
                    "Missing definition for node of type %s. Looking into local classes...",
                    validation.instance["type"],
                )
                missing_component_class = BaseComponent.get_subclass(validation.instance["type"])
                schema = inject_definition_in_schema(node_class=missing_component_class, schema=schema)
@ -1092,8 +1092,10 @@ class EvaluationResult:
            query_answers = answers[answers["multilabel_id"] == multilabel_id]
            if answer_metric not in metrics:
                logger.warning(
                    f"You specified an answer_metric={answer_metric} not available in calculated metrics={metrics.keys()}."
                    f"Skipping collection of worst performing samples."
                    "You specified an answer_metric=%s not available in calculated metrics=%s."
                    "Skipping collection of worst performing samples.",
                    answer_metric,
                    metrics.keys(),
                )
                break
            if metrics[answer_metric] <= answer_metric_threshold:
@ -1127,8 +1129,10 @@ class EvaluationResult:
        for multilabel_id, metrics in worst_df.iterrows():
            if document_metric not in metrics:
                logger.warning(
                    f"You specified a document_metric={document_metric} not available in calculated metrics={metrics.keys()}."
                    f"Skipping collection of worst performing samples."
                    "You specified a document_metric=%s not available in calculated metrics=%s."
                    "Skipping collection of worst performing samples.",
                    document_metric,
                    metrics.keys(),
                )
                break
            if metrics[document_metric] <= document_metric_threshold:
@ -1185,9 +1189,9 @@ class EvaluationResult:
            document_relevance_criterion = answer_scope_to_doc_relevance_crit.get(answer_scope, document_scope)
        elif answer_scope in answer_scope_to_doc_relevance_crit.keys():
            logger.warning(
                f"You specified a non-answer document_scope together with a non-default answer_scope. "
                f"This may result in inconsistencies between answer and document metrics. "
                f"To enforce the same definition of correctness for both, document_scope must be one of {['answer', 'document_id_or_answer']}."
                "You specified a non-answer document_scope together with a non-default answer_scope. "
                "This may result in inconsistencies between answer and document metrics. "
                "To enforce the same definition of correctness for both, document_scope must be one of 'answer', 'document_id_or_answer'."
            )

        return document_relevance_criterion  # type: ignore[return-value]
@ -250,7 +250,11 @@ def _write_telemetry_config():
    # show a log message if telemetry config is written for the first time
    if not CONFIG_PATH.is_file():
        logger.info(
            f"Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://docs.haystack.deepset.ai/docs/telemetry"
            "Haystack sends anonymous usage data to understand the actual usage and steer dev efforts "
            "towards features that are most meaningful to users. You can opt-out at anytime by calling "
            "disable_telemetry() or by manually setting the environment variable "
            "HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation "
            "page. More information at https://docs.haystack.deepset.ai/docs/telemetry"
        )
    CONFIG_PATH.parents[0].mkdir(parents=True, exist_ok=True)
    user_id = _get_or_create_user_id()
@ -420,7 +420,11 @@ class IndexClient:
            doc = response.json()
        else:
            logger.warning(
                f"Document {id} could not be fetched from deepset Cloud: HTTP {response.status_code} - {response.reason}\n{response.content.decode()}"
                "Document %s could not be fetched from deepset Cloud: HTTP %s - %s\n%s",
                id,
                response.status_code,
                response.reason,
                response.content.decode(),
            )
        return doc
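One caveat that applies to this conversion in general: lazy %-formatting defers building the message string, but the arguments are still evaluated at the call site, so here response.content.decode() runs even when WARNING records are filtered out. If an argument is expensive to compute, a level guard avoids the work entirely; a sketch with illustrative names:

import logging

logger = logging.getLogger("example")

def log_fetch_failure(doc_id, response):
    # decode() would run regardless of the log level, so guard it explicitly
    # when the payload may be large.
    if logger.isEnabledFor(logging.WARNING):
        logger.warning("Document %s could not be fetched: %s", doc_id, response.content.decode())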
@ -625,7 +629,9 @@ class PipelineClient:
        else:
            logger.info("Pipeline config '%s' is already deployed.", pipeline_config_name)
        logger.info(
            f"Search endpoint for pipeline config '{pipeline_config_name}' is up and running for you under {pipeline_url}"
            "Search endpoint for pipeline config '%s' is up and running for you under %s",
            pipeline_config_name,
            pipeline_url,
        )
        if show_curl_message:
            curl_cmd = (
@ -925,7 +931,9 @@ class EvaluationSetClient:
            with open(file_path, "rb") as file:
                self.client.post(url=target_url, files={"file": (file_path.name, file, mime_type)})
            logger.info(
                f"Successfully uploaded evaluation set file {file_path}. You can access it now under evaluation set '{file_path.name}'."
                "Successfully uploaded evaluation set file %s. You can access it now under evaluation set '%s'.",
                file_path,
                file_path.name,
            )
        except DeepsetCloudError as e:
            logger.error("Error uploading evaluation set file %s: %s", file_path, e.args)
@ -87,8 +87,9 @@ def stop_container(container_name, delete_container=False):
    status = subprocess.run([f"docker stop {container_name}"], shell=True)
    if status.returncode:
        logger.warning(
            f"Tried to stop {container_name} but this failed. "
            f"It is likely that there was no Docker container with the name {container_name}"
            "Tried to stop %s but this failed. It is likely that there was no Docker container with the name %s",
            container_name,
            container_name,
        )
    if delete_container:
        status = subprocess.run([f"docker rm {container_name}"], shell=True)
@ -61,8 +61,8 @@ def print_answers(results: dict, details: str = "all", max_text_len: Optional[in
            filtered_answers = answers
        else:
            valid_values = ", ".join(fields_to_keep_by_level.keys()) + " and 'all'"
            logging.warn(f"print_answers received details='{details}', which was not understood. ")
            logging.warn(f"Valid values are {valid_values}. Using 'all'.")
            logging.warn("print_answers received details='%s', which was not understood. ", details)
            logging.warn("Valid values are %s. Using 'all'.", valid_values)
            filtered_answers = answers

    # Shorten long text fields
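Separate from the interpolation change, logging.warn as used in this hunk is a deprecated alias of logging.warning, and the module-level call routes through the root logger rather than a per-module logger. The usual modern form would be something like:

import logging

logger = logging.getLogger(__name__)  # per-module logger instead of the root logger

details = "everything"  # hypothetical caller input
logger.warning("print_answers received details='%s', which was not understood. ", details)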
@ -50,8 +50,11 @@ def retry_with_exponential_backoff(

            # Sleep for the delay
            logger.warning(
                f"{e.__class__.__name__ } - {e}, "
                f"retry {function.__name__} in {'{0:.2f}'.format(sleep_time)} seconds..."
                "%s - %s, retry %s in %s seconds...",
                e.__class__.__name__,
                e,
                function.__name__,
                "{0:.2f}".format(sleep_time),
            )
            time.sleep(sleep_time)
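The "{0:.2f}".format(sleep_time) argument above pre-formats the float eagerly before the logger sees it. Since printf-style templates support precision directly, %.2f in the template would keep the whole call lazy; a sketch:

import logging

logger = logging.getLogger("example")
sleep_time = 1.2345  # illustrative value

# As committed: the float is turned into a string at the call site.
logger.warning("retry in %s seconds...", "{0:.2f}".format(sleep_time))

# Alternative: let the logging framework apply the precision on emit.
logger.warning("retry in %.2f seconds...", sleep_time)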
@ -178,7 +178,7 @@ def create_dpr_training_dataset(squad_data: dict, retriever: BaseRetriever, num_

        if not hard_negative_ctxs or not positive_ctxs:
            logging.error(
                f"No retrieved candidates for article {article_title}, with question {question['question']}"
                "No retrieved candidates for article %s, with question %s", article_title, question["question"]
            )
            n_non_added_questions += 1
            continue
@ -260,7 +260,6 @@ disable = [
    "too-few-public-methods",
    "raise-missing-from",
    "invalid-name",
    "logging-fstring-interpolation",
    "too-many-locals",
    "duplicate-code",
    "too-many-arguments",
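With "logging-fstring-interpolation" removed from the disable list, pylint's W1203 check is active again and flags f-strings passed straight to logging calls, which is what drove the rewrites above. Roughly what the check rejects and accepts (a sketch):

import logging

logger = logging.getLogger(__name__)
name = "retriever"

logger.info(f"Loading component {name}")   # W1203: logging-fstring-interpolation
logger.info("Loading component %s", name)  # fine: lazy %-style formatting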
@ -171,8 +171,9 @@ def export_feedback(
            context = squad_label["paragraphs"][0]["context"]
            if not context[start : start + len(answer)] == answer:
                logger.error(
                    f"Skipping invalid squad label as string via offsets "
                    f"('{context[start:start + len(answer)]}') does not match answer string ('{answer}') "
                    "Skipping invalid squad label as string via offsets ('%s') does not match answer string ('%s') ",
                    context[start : start + len(answer)],
                    answer,
                )
            export_data.append(squad_label)
@ -97,22 +97,27 @@ def _format_filters(filters):
    new_filters = {}
    if filters is None:
        logger.warning(
            f"Request with deprecated filter format ('\"filters\": null'). "
            f"Remove empty filters from params to be compliant with future versions"
            "Request with deprecated filter format ('\"filters\": null'). "
            "Remove empty filters from params to be compliant with future versions"
        )
    else:
        for key, values in filters.items():
            if values is None:
                logger.warning(
                    f"Request with deprecated filter format ('{key}: null'). "
                    f"Remove null values from filters to be compliant with future versions"
                    "Request with deprecated filter format ('%s: null'). "
                    "Remove null values from filters to be compliant with future versions",
                    key,
                )
                continue

            if not isinstance(values, list):
                logger.warning(
                    f"Request with deprecated filter format ('{key}': {values}). "
                    f"Change to '{key}':[{values}]' to be compliant with future versions"
                    "Request with deprecated filter format ('%s': %s). "
                    "Change to '%s':[%s]' to be compliant with future versions",
                    key,
                    values,
                    key,
                    values,
                )
                values = [values]