Mirror of https://github.com/deepset-ai/haystack.git
chore: enable logging-fstring-interpolation and cleanup (#3843)

* enable logging-fstring-interpolation
* remove logging-fstring-interpolation from exclusion list
* remove implicit string interpolations added by black
* remove from rest_api too
* fix % sign
parent 4cbc8550d6
commit d157e41c1f
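What the change looks like: every logging call that built its message eagerly with an f-string now passes a constant %-style format string plus the arguments, so interpolation is deferred to the logging framework and only happens if the record is actually emitted. A minimal sketch of the before/after pattern (hypothetical logger and value, not taken from the diff below):

    import logging

    logger = logging.getLogger(__name__)
    index_name = "documents"  # hypothetical example value

    # Before: the f-string is formatted even when INFO is disabled.
    logger.info(f"Writing documents to index '{index_name}'.")

    # After: the format string stays constant; logging interpolates
    # '%s' lazily, only when a handler actually emits the record.
    logger.info("Writing documents to index '%s'.", index_name)

    # "fix % sign": a literal percent inside a %-style format string
    # must be escaped as '%%' (see the [Haystack Tip] hunk below).
    logger.info("Clipped %s%% of samples.", 42.5)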
@@ -476,8 +476,9 @@ class BaseDocumentStore(BaseComponent):
 else:
     jsonl_filename = (file_path.parent / (file_path.stem + ".jsonl")).as_posix()
     logger.info(
-        f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
-        f"Converting json to jsonl to: {jsonl_filename}"
+        "Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
+        "Converting json to jsonl to: %s",
+        jsonl_filename,
     )
     squad_json_to_jsonl(filename, jsonl_filename)
     self.add_eval_data(
@@ -622,8 +623,9 @@ class BaseDocumentStore(BaseComponent):
 for document in documents:
     if document.id in _hash_ids:
         logger.info(
-            f"Duplicate Documents: Document with id '{document.id}' already exists in index "
-            f"'{index or self.index}'"
+            "Duplicate Documents: Document with id '%s' already exists in index '%s'",
+            document.id,
+            index or self.index,
         )
         continue
     _documents.append(document)
@@ -118,23 +118,25 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
 indexing_info = index_info["indexing"]
 if indexing_info["pending_file_count"] > 0:
     logger.warning(
-        f"{indexing_info['pending_file_count']} files are pending to be indexed. "
-        f"Indexing status: {indexing_info['status']}"
+        "%s files are pending to be indexed. Indexing status: %s",
+        indexing_info["pending_file_count"],
+        indexing_info["status"],
     )
 if index in deployed_unhealthy_pipelines:
     logger.warning(
-        f"The index '{index}' is unhealthy and should be redeployed using "
-        f"`Pipeline.undeploy_on_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`."
+        "The index '%s' is unhealthy and should be redeployed using "
+        "`Pipeline.undeploy_on_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`.",
+        index,
     )
 else:
     logger.info(
-        f"You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
-        f"This document store always returns empty responses. This can be useful if you want to "
-        f"create a new pipeline within deepset Cloud.\n"
-        f"In order to create a new pipeline on deepset Cloud, take the following steps: \n"
-        f" - create query and indexing pipelines using this DocumentStore\n"
-        f" - call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name`\n"
-        f" - call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name`"
+        "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
+        "This document store always returns empty responses. This can be useful if you want to "
+        "create a new pipeline within deepset Cloud.\n"
+        "In order to create a new pipeline on deepset Cloud, take the following steps: \n"
+        " - create query and indexing pipelines using this DocumentStore\n"
+        " - call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name`\n"
+        " - call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name`"
     )

 self.evaluation_set_client = DeepsetCloud.get_evaluation_set_client(
@@ -508,9 +508,10 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore):

 if not any(indices):
     logger.warning(
-        f"To use an index, you must create it first. The index called '{index_name}' doesn't exist. "
-        f"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
-        f"Note that this instance doesn't validate the index after you create it."
+        "To use an index, you must create it first. The index called '%s' doesn't exist. "
+        "You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
+        "Note that this instance doesn't validate the index after you create it.",
+        index_name,
     )

 # If the index name is an alias that groups multiple existing indices, each of them must have an embedding_field.
@@ -206,7 +206,10 @@ class FAISSDocumentStore(SQLDocumentStore):
     index.hnsw.efConstruction = ef_construction

     logger.info(
-        f"HNSW params: n_links: {n_links}, efSearch: {index.hnsw.efSearch}, efConstruction: {index.hnsw.efConstruction}"
+        "HNSW params: n_links: %s, efSearch: %s, efConstruction: %s",
+        n_links,
+        index.hnsw.efSearch,
+        index.hnsw.efConstruction,
     )
 else:
     index = faiss.index_factory(embedding_dim, index_factory, metric_type)
@@ -550,8 +553,10 @@ class FAISSDocumentStore(SQLDocumentStore):
 """
 if index == self.index:
     logger.warning(
-        f"Deletion of default index '{index}' detected. "
-        f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+        "Deletion of default index '%s' detected. "
+        "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+        index,
+        self.__class__.__name__,
     )
 if index in self.faiss_indexes:
     del self.faiss_indexes[index]
@@ -110,8 +110,9 @@ class InMemoryDocumentStore(KeywordDocumentStore):
 self.devices, _ = initialize_device_settings(devices=devices, use_cuda=self.use_gpu, multi_gpu=False)
 if len(self.devices) > 1:
     logger.warning(
-        f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-        f"using the first device {self.devices[0]}."
+        "Multiple devices are not supported in %s inference, using the first device %s.",
+        self.__class__.__name__,
+        self.devices[0],
     )

 self.main_device = self.devices[0]
@@ -184,7 +185,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
 )
 if duplicate_documents == "skip":
     logger.warning(
-        f"Duplicate Documents: Document with id '{document.id} already exists in index " f"'{index}'"
+        "Duplicate Documents: Document with id '%s' already exists in index '%s'", document.id, index
     )
     continue
 self.indexes[index][document.id] = document
@@ -205,8 +206,9 @@ class InMemoryDocumentStore(KeywordDocumentStore):
 textual_documents = [doc for doc in all_documents if doc.content_type == "text"]
 if len(textual_documents) < len(all_documents):
     logger.warning(
-        f"Some documents in {index} index are non-textual."
-        f" They will be written to the index, but the corresponding BM25 representations will not be generated."
+        "Some documents in %s index are non-textual."
+        " They will be written to the index, but the corresponding BM25 representations will not be generated.",
+        index,
     )

 tokenized_corpus = [
@@ -236,10 +238,11 @@ class InMemoryDocumentStore(KeywordDocumentStore):
 duplicate_ids: list = [label.id for label in self._get_duplicate_labels(label_objects, index=index)]
 if len(duplicate_ids) > 0:
     logger.warning(
-        f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
-        f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
-        f" the answer annotation and not the question."
-        f" Problematic ids: {','.join(duplicate_ids)}"
+        "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
+        " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
+        " the answer annotation and not the question."
+        " Problematic ids: %s",
+        ",".join(duplicate_ids),
     )

 for label in label_objects:
@@ -497,8 +497,10 @@ class MilvusDocumentStore(SQLDocumentStore):
 """
 if index == self.index:
     logger.warning(
-        f"Deletion of default index '{index}' detected. "
-        f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+        "Deletion of default index '%s' detected. "
+        "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+        index,
+        self.__class__.__name__,
     )
 self._delete_index(index)

@@ -527,9 +527,10 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
 if not any(indices):
     # We don't want to raise here as creating a query-only document store before the index being created asynchronously is a valid use case.
     logger.warning(
-        f"Before you can use an index, you must create it first. The index '{index_name}' doesn't exist. "
-        f"You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
-        f"Note that this instance doesn't validate the index after you created it."
+        "Before you can use an index, you must create it first. The index '%s' doesn't exist. "
+        "You can create it by setting `create_index=True` on init or by calling `write_documents()` if you prefer to create it on demand. "
+        "Note that this instance doesn't validate the index after you created it.",
+        index_name,
     )

 # If the index name is an alias that groups multiple existing indices, each of them must have an embedding_field.
@@ -583,11 +584,11 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
 if self.index_type == "hnsw" and ef_search != 20:
     body = {"knn.algo_param.ef_search": 20}
     self.client.indices.put_settings(index=index_id, body=body, headers=headers)
-    logger.info(f"Set ef_search to 20 for hnsw index '{index_id}'.")
+    logger.info("Set ef_search to 20 for hnsw index '%s'.", index_id)
 elif self.index_type == "flat" and ef_search != 512:
     body = {"knn.algo_param.ef_search": 512}
     self.client.indices.put_settings(index=index_id, body=body, headers=headers)
-    logger.info(f"Set ef_search to 512 for hnsw index '{index_id}'.")
+    logger.info("Set ef_search to 512 for hnsw index '%s'.", index_id)

 def _validate_approximate_knn_settings(
     self, existing_embedding_field: Dict[str, Any], index_settings: Dict[str, Any], index_id: str
@@ -216,7 +216,10 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
 except Exception as e:
     if hasattr(e, "status_code") and e.status_code == 429:  # type: ignore
         logger.warning(
-            f"Failed to insert a batch of '{len(documents)}' documents because of a 'Too Many Requeset' response. Splitting the number of documents into two chunks with the same size and retrying in {_timeout} seconds."
+            "Failed to insert a batch of '%s' documents because of a 'Too Many Requeset' response. "
+            "Splitting the number of documents into two chunks with the same size and retrying in %s seconds.",
+            len(documents),
+            _timeout,
         )
         if len(documents) == 1:
             logger.warning(
@@ -478,10 +481,11 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
 duplicate_ids: list = [label.id for label in self._get_duplicate_labels(label_list, index=index)]
 if len(duplicate_ids) > 0:
     logger.warning(
-        f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
-        f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
-        f" the answer annotation and not the question."
-        f" Problematic ids: {','.join(duplicate_ids)}"
+        "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
+        " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
+        " the answer annotation and not the question."
+        " Problematic ids: %s",
+        ",".join(duplicate_ids),
     )
 labels_to_index = []
 for label in label_list:
@@ -1087,7 +1091,8 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
 if not isinstance(query, str):
     logger.warning(
         "The query provided seems to be not a string, but an object "
-        f"of type {type(query)}. This can cause the query to fail."
+        "of type %s. This can cause the query to fail.",
+        type(query),
     )
 operator = "AND" if all_terms_must_match else "OR"
 body = {
@@ -1599,8 +1604,10 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
 """
 if index == self.index:
     logger.warning(
-        f"Deletion of default index '{index}' detected. "
-        f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+        "Deletion of default index '%s' detected. "
+        "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+        index,
+        self.__class__.__name__,
     )
 self._delete_index(index)

@@ -448,10 +448,11 @@ class SQLDocumentStore(BaseDocumentStore):
 duplicate_ids: list = [label.id for label in self._get_duplicate_labels(labels, index=index)]
 if len(duplicate_ids) > 0:
     logger.warning(
-        f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
-        f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
-        f" the answer annotation and not the question."
-        f" Problematic ids: {','.join(duplicate_ids)}"
+        "Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
+        " This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
+        " the answer annotation and not the question."
+        " Problematic ids: %s",
+        ",".join(duplicate_ids),
     )
 # TODO: Use batch_size

@@ -52,8 +52,9 @@ def eval_data_from_json(
 problematic_ids.extend(cur_problematic_ids)
 if len(problematic_ids) > 0:
     logger.warning(
-        f"Could not convert an answer for {len(problematic_ids)} questions.\n"
-        f"There were conversion errors for question ids: {problematic_ids}"
+        "Could not convert an answer for %s questions.\nThere were conversion errors for question ids: %s",
+        len(problematic_ids),
+        problematic_ids,
     )
 return docs, labels

@@ -99,8 +100,10 @@ def eval_data_from_jsonl(
 if len(docs) >= batch_size:
     if len(problematic_ids) > 0:
         logger.warning(
-            f"Could not convert an answer for {len(problematic_ids)} questions.\n"
-            f"There were conversion errors for question ids: {problematic_ids}"
+            "Could not convert an answer for %s questions.\n"
+            "There were conversion errors for question ids: %s",
+            len(problematic_ids),
+            problematic_ids,
         )
     yield docs, labels
     docs = []
@@ -358,7 +358,9 @@ class WeaviateDocumentStore(KeywordDocumentStore):
 generated_uuid = str(uuid.UUID(hashed_id.hexdigest()[::2]))
 if not self.uuid_format_warning_raised:
     logger.warning(
-        f"Document id {id} is not in uuid format. Such ids will be replaced by uuids, in this case {generated_uuid}."
+        "Document id %s is not in uuid format. Such ids will be replaced by uuids, in this case %s.",
+        id,
+        generated_uuid,
     )
     self.uuid_format_warning_raised = True
 id = generated_uuid
@@ -1507,8 +1509,10 @@ class WeaviateDocumentStore(KeywordDocumentStore):
 """
 if index == self.index:
     logger.warning(
-        f"Deletion of default index '{index}' detected. "
-        f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects."
+        "Deletion of default index '%s' detected. "
+        "If you plan to use this index again, please reinstantiate '%s' in order to avoid side-effects.",
+        index,
+        self.__class__.__name__,
     )
 self._delete_index(index)

@@ -336,7 +336,9 @@ class DataSilo:
 logger.warning("No dev set created. Please adjust the dev_split parameter.")

 logger.info(
-    f"Took {len(dev_dataset)} samples out of train set to create dev set (dev split is roughly {self.processor.dev_split})"
+    "Took %s samples out of train set to create dev set (dev split is roughly %s)",
+    len(dev_dataset),
+    self.processor.dev_split,
 )

 def random_split_ConcatDataset(self, ds: ConcatDataset, lengths: List[int]):
@@ -387,7 +389,7 @@ class DataSilo:
     clipped, ave_len, seq_lens, max_seq_len = self._calc_length_stats_biencoder()
 else:
     logger.warning(
-        f"Could not compute length statistics because 'input_ids' or 'query_input_ids' and 'passage_input_ids' are missing."
+        "Could not compute length statistics because 'input_ids' or 'query_input_ids' and 'passage_input_ids' are missing."
     )
     clipped = -1
     ave_len = -1
@@ -416,11 +418,14 @@ class DataSilo:
 logger.info("Proportion clipped: {}".format(clipped))
 if clipped > 0.5:
     logger.info(
-        f"[Haystack Tip] {round(clipped * 100, 1)}% of your samples got cut down to {max_seq_len} tokens. "
+        "[Haystack Tip] %s%% of your samples got cut down to %s tokens. "
         "Consider increasing max_seq_len "
-        f"(the maximum value allowed with the current model is max_seq_len={self.processor.tokenizer.model_max_length}, "
+        "(the maximum value allowed with the current model is max_seq_len=%s, "
         "if this is not enough consider splitting the document in smaller units or changing the model). "
-        "This will lead to higher memory consumption but is likely to improve your model performance"
+        "This will lead to higher memory consumption but is likely to improve your model performance",
+        round(clipped * 100, 1),
+        max_seq_len,
+        self.processor.tokenizer.model_max_length,
     )
 elif "query_input_ids" in self.tensor_names and "passage_input_ids" in self.tensor_names:
     logger.info(
@@ -59,9 +59,11 @@ def convert_features_to_dataset(features):
     base = check.ravel()[0]
     if not np.issubdtype(type(base), np.integer):
         logger.warning(
-            f"Problem during conversion to torch tensors:\n"
-            f"A non-integer value for feature '{t_name}' with a value of: "
-            f"'{base}' will be converted to a torch tensor of dtype long."
+            "Problem during conversion to torch tensors:\n"
+            "A non-integer value for feature '%s' with a value of: "
+            "'%s' will be converted to a torch tensor of dtype long.",
+            t_name,
+            base,
         )
 except:
     logger.debug(
@@ -38,10 +38,11 @@ def sample_to_features_text(sample, tasks, max_seq_len, tokenizer):

 if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
     logger.error(
-        f"FastTokenizer encoded sample {sample.clear_text['text']} to "
-        f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
-        f"from number of tokens produced in tokenize_with_metadata(). \n"
-        f"Further processing is likely to be wrong."
+        "FastTokenizer encoded sample %s to %s tokens, which differs "
+        "from number of tokens produced in tokenize_with_metadata(). \n"
+        "Further processing is likely to be wrong.",
+        sample.clear_text["text"],
+        len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1),
     )
 else:
     # TODO It might be cleaner to adjust the data structure in sample.tokenized
@@ -565,8 +565,9 @@ class SquadProcessor(Processor):
     )
 except Exception as e:
     logger.warning(
-        f"Could not devide document into passages. Document: {basket.raw['document_text'][:200]}\n"
-        f"With error: {e}"
+        "Could not devide document into passages. Document: %s\nWith error: %s",
+        basket.raw["document_text"][:200],
+        e,
     )
     passage_spans = []

@@ -663,8 +664,9 @@ class SquadProcessor(Processor):
 # check if answer string can be found in context
 if answer_text not in doc_text:
     logger.warning(
-        f"Answer '{answer['text']}' not contained in context.\n"
-        f"Example will not be converted for training/evaluation."
+        "Answer '%s' not contained in context.\n"
+        "Example will not be converted for training/evaluation.",
+        answer["text"],
     )
     error_in_answer = True
     label_idxs[i][0] = -100  # TODO remove this hack also from featurization
@@ -672,8 +674,10 @@ class SquadProcessor(Processor):
 break  # Break loop around answers, so the error message is not shown multiple times
 if answer_indices.strip() != answer_text.strip():
     logger.warning(
-        f"Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n"
-        f"Example will not be converted for training/evaluation."
+        "Answer using start/end indices is '%s' while gold label text is '%s'.\n"
+        "Example will not be converted for training/evaluation.",
+        answer_indices,
+        answer_text,
     )
     error_in_answer = True
     label_idxs[i][0] = -100  # TODO remove this hack also from featurization
@@ -1025,7 +1029,7 @@ class TextSimilarityProcessor(Processor):

 if problematic_ids:
     logger.error(
-        f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
+        "There were %s errors during preprocessing at positions: %s", len(problematic_ids), problematic_ids
     )

 if return_baskets:
@@ -1104,7 +1108,7 @@ class TextSimilarityProcessor(Processor):

 if len(tokenized_query) == 0:
     logger.warning(
-        f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
+        "The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
     )
     return None

@@ -1222,7 +1226,8 @@ class TextSimilarityProcessor(Processor):
 if title is None:
     title = ""
     logger.warning(
-        f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' "
+        "Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '%s' ",
+        ctx,
     )
 res.append(tuple((title, ctx)))
 return res
@@ -1545,7 +1550,7 @@ class TableTextSimilarityProcessor(Processor):

 if problematic_ids:
     logger.error(
-        f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
+        "There were %s errors during preprocessing at positions: %s", len(problematic_ids), problematic_ids
     )

 if return_baskets:
@@ -1588,7 +1593,7 @@ class TableTextSimilarityProcessor(Processor):

 if len(tokenized_query) == 0:
     logger.warning(
-        f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
+        "The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
     )
     return None

@@ -125,7 +125,8 @@ class Evaluator:
 temperature_change = (abs(temperature_current - temperature_previous) / temperature_previous) * 100.0
 if temperature_change > 50:
     logger.warning(
-        f"temperature used for calibration of confidence scores changed by more than {temperature_change} percent"
+        "temperature used for calibration of confidence scores changed by more than %s percent",
+        temperature_change,
     )
 if hasattr(head, "aggregate_preds"):
     # Needed to convert NQ ids from np arrays to strings
@@ -146,8 +147,11 @@ class Evaluator:
     result["report"] = compute_report_metrics(head, preds_all[head_num], label_all[head_num])
 except:
     logger.error(
-        f"Couldn't create eval report for head {head_num} with following preds and labels:"
-        f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}"
+        "Couldn't create eval report for head %s with following preds and labels:"
+        "\n Preds: %s \n Labels: %s",
+        head_num,
+        preds_all[head_num],
+        label_all[head_num],
     )
     result["report"] = "Error"

@@ -77,8 +77,9 @@ class Inferencer:
 self.devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
 if len(self.devices) > 1:
     logger.warning(
-        f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-        f"using the first device {self.devices[0]}."
+        "Multiple devices are not supported in %s inference, using the first device %s.",
+        self.__class__.__name__,
+        self.devices[0],
     )

 self.processor = processor
@@ -187,9 +188,7 @@ class Inferencer:

 devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
 if len(devices) > 1:
-    logger.warning(
-        f"Multiple devices are not supported in Inferencer, " f"using the first device {devices[0]}."
-    )
+    logger.warning("Multiple devices are not supported in Inferencer, using the first device %s.", devices[0])

 name = os.path.basename(model_name_or_path)

@@ -390,8 +390,9 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
 for prediction_head in self.prediction_heads:
     if len(prediction_head.layer_dims) != 2:
         logger.error(
-            f"Currently conversion only works for PredictionHeads that are a single layer Feed Forward NN with dimensions [LM_output_dim, number_classes].\n"
-            f" Your PredictionHead has {str(prediction_head.layer_dims)} dimensions."
+            "Currently conversion only works for PredictionHeads that are a single layer Feed Forward NN with dimensions [LM_output_dim, number_classes].\n"
+            " Your PredictionHead has %s dimensions.",
+            str(prediction_head.layer_dims),
         )
         continue
     if prediction_head.model_type == "span_classification":
@@ -399,8 +400,8 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
     converted_models.append(transformers_model)
 else:
     logger.error(
-        f"Haystack -> Transformers conversion is not supported yet for"
-        f" prediction heads of type {prediction_head.model_type}"
+        "Haystack -> Transformers conversion is not supported yet for prediction heads of type %s",
+        prediction_head.model_type,
     )

 return converted_models
@@ -93,7 +93,7 @@ class FeatureExtractor:
 with open(config_file) as f:
     config = json.load(f)
 feature_extractor_classname = config["tokenizer_class"]
-logger.debug(f"⛏️ Selected feature extractor: {feature_extractor_classname} (from {config_file})")
+logger.debug("⛏️ Selected feature extractor: %s (from %s)", feature_extractor_classname, config_file)
 # Use FastTokenizers as much as possible
 try:
     feature_extractor_class = getattr(transformers, feature_extractor_classname + "Fast")
@@ -122,7 +122,7 @@ class FeatureExtractor:
     f"\n- {f'{chr(10)}- '.join(FEATURE_EXTRACTORS.keys())}"
 ) from e
 logger.debug(
-    f"⛏️ Selected feature extractor: {feature_extractor_class.__name__} (for model type '{model_type}')"
+    "⛏️ Selected feature extractor: %s (for model type '%s')", feature_extractor_class.__name__, model_type
 )

 self.default_params = DEFAULT_EXTRACTION_PARAMS.get(feature_extractor_class, {})
@@ -293,7 +293,7 @@ class HFLanguageModel(LanguageModel):
 model_emb_size = self.model.resize_token_embeddings(new_num_tokens=None).num_embeddings
 vocab_size = model_emb_size + n_added_tokens
 logger.info(
-    f"Resizing embedding layer of LM from {model_emb_size} to {vocab_size} to cope with custom vocab."
+    "Resizing embedding layer of LM from %s to %s to cope with custom vocab.", model_emb_size, vocab_size
 )
 self.model.resize_token_embeddings(vocab_size)
 # verify
@@ -464,7 +464,7 @@ class HFLanguageModelNoSegmentIds(HFLanguageModelWithPooler):
 specified using the arguments `output_hidden_states` and `output_attentions`.
 """
 if segment_ids is not None:
-    logger.warning(f"'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)
+    logger.warning("'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)

 return super().forward(
     input_ids=input_ids,
@@ -636,8 +636,9 @@ class DPREncoder(LanguageModel):
 """
 if model_config.model_type.lower() != "bert":
     logger.warning(
-        f"Using a model of type '{model_config.model_type}' which might be incompatible with DPR encoders. "
-        f"Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors."
+        "Using a model of type '%s' which might be incompatible with DPR encoders. "
+        "Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors.",
+        model_config.model_type,
     )
 config_dict = vars(model_config)
 if model_kwargs:
@@ -876,12 +877,13 @@ def get_language_model(

 if not model_type:
     logger.error(
-        f"Model type not understood for '{pretrained_model_name_or_path}' "
-        f"({model_type if model_type else 'model_type not set'}). "
+        "Model type not understood for '%s' (%s). "
         "Either supply the local path for a saved model, "
         "or the name of a model that can be downloaded from the Model Hub. "
        "Ensure that the model class name can be inferred from the directory name "
-        "when loading a Transformers model."
+        "when loading a Transformers model.",
+        pretrained_model_name_or_path,
+        model_type if model_type else "model_type not set",
     )
     logger.error("Using the AutoModel class for '%s'. This can cause crashes!", pretrained_model_name_or_path)
     model_type = "Auto"
@@ -957,7 +959,7 @@ def _get_model_type(

 if model_type and model_type.lower() == "roberta" and "mlm" in model_name_or_path.lower():
     logger.error(
-        f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later."
+        "MLM part of codebert is currently not supported in Haystack: '%s' may crash later.", model_name_or_path
     )

 return model_type
@@ -88,13 +88,14 @@ def get_model(
     config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, **autoconfig_kwargs)
     model_type = config.model_type
 except Exception as e:
-    logger.debug(f"Can't find model type for {pretrained_model_name_or_path}: {e}")
+    logger.debug("Can't find model type for %s: %s", pretrained_model_name_or_path, e)

 if feature_extractor_kwargs is not None:
     logger.debug(
         "Can't forward feature_extractor_kwargs to a SentenceTransformers model. "
         "These kwargs are being dropped. "
-        f"Content of feature_extractor_kwargs: {feature_extractor_kwargs}"
+        "Content of feature_extractor_kwargs: %s",
+        feature_extractor_kwargs,
     )

 else:
@@ -102,9 +103,10 @@ def get_model(
 config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, **autoconfig_kwargs)
 if not config.model_type:
     logger.error(
-        f"Model type not understood for '{pretrained_model_name_or_path}'. Please provide the name of "
+        "Model type not understood for '%s'. Please provide the name of "
         "a model that can be downloaded from the Model Hub.\nUsing the AutoModel class. "
-        "THIS CAN CAUSE CRASHES and won't work for models that are not working with text."
+        "THIS CAN CAUSE CRASHES and won't work for models that are not working with text.",
+        pretrained_model_name_or_path,
     )
     model_type = None
 else:
@@ -112,10 +114,13 @@ def get_model(
     model_type = HUGGINGFACE_CAPITALIZE[config.model_type.lower()]
 except KeyError as e:
     logger.error(
-        f"Haystack doesn't support model '{pretrained_model_name_or_path}' (type '{config.model_type.lower()}') "
+        "Haystack doesn't support model '%s' (type '%s') "
         "We'll use the AutoModel class for it. "
         "THIS CAN CAUSE CRASHES and won't work for models that are not working with text. "
-        f"Supported model types: {', '.join(HUGGINGFACE_CAPITALIZE.keys())}"
+        "Supported model types: %s",
+        pretrained_model_name_or_path,
+        config.model_type.lower(),
+        ", ".join(HUGGINGFACE_CAPITALIZE.keys()),
     )
     model_type = None

@@ -25,9 +25,11 @@ class HaystackModel(ABC):
 See the values of `haystack.schema.ContentTypes`.
 """
 logger.info(
-    f" 🤖 Loading '{pretrained_model_name_or_path}' "
-    f"({self.__class__.__name__} of type '{model_type if model_type else '<unknown>'}' "
-    f"for {content_type} data)"
+    " 🤖 Loading '%s' (%s of type '%s' for %s data)",
+    pretrained_model_name_or_path,
+    self.__class__.__name__,
+    model_type if model_type else "<unknown>",
+    content_type,
 )
 self.model_name_or_path = pretrained_model_name_or_path
 self.model_type = model_type
|
@ -164,8 +164,11 @@ class PredictionHead(nn.Module):
|
|||||||
return
|
return
|
||||||
new_dims = [input_dim] + old_dims[1:]
|
new_dims = [input_dim] + old_dims[1:]
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Resizing input dimensions of {type(self).__name__} ({self.task_name}) "
|
"Resizing input dimensions of %s (%s) from %s to %s to match language model",
|
||||||
f"from {old_dims} to {new_dims} to match language model"
|
type(self).__name__,
|
||||||
|
self.task_name,
|
||||||
|
old_dims,
|
||||||
|
new_dims,
|
||||||
)
|
)
|
||||||
self.feed_forward = FeedForwardBlock(new_dims)
|
self.feed_forward = FeedForwardBlock(new_dims)
|
||||||
self.layer_dims[0] = input_dim
|
self.layer_dims[0] = input_dim
|
||||||
@@ -260,8 +263,8 @@ class QuestionAnsweringHead(PredictionHead):
 super(QuestionAnsweringHead, self).__init__()
 if len(kwargs) > 0:
     logger.warning(
-        f"Some unused parameters are passed to the QuestionAnsweringHead. "
-        f"Might not be a problem. Params: {json.dumps(kwargs)}"
+        "Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: %s",
+        json.dumps(kwargs),
     )
 self.layer_dims = layer_dims
 assert self.layer_dims[-1] == 2
@@ -105,20 +105,23 @@ class QACandidate:
     self.answer = "no_answer"
     if self.offset_answer_start != 0 or self.offset_answer_end != 0:
         logger.error(
-            f"Both start and end offsets should be 0: \n"
-            f"{self.offset_answer_start}, {self.offset_answer_end} with a no_answer. "
+            "Both start and end offsets should be 0: \n%s, %s with a no_answer. ",
+            self.offset_answer_start,
+            self.offset_answer_end,
         )
 else:
     self.answer = string
     if self.offset_answer_end - self.offset_answer_start <= 0:
         logger.error(
-            f"End offset comes before start offset: \n"
-            f"({self.offset_answer_start}, {self.offset_answer_end}) with a span answer. "
+            "End offset comes before start offset: \n(%s, %s) with a span answer. ",
+            self.offset_answer_start,
+            self.offset_answer_end,
         )
     elif self.offset_answer_end <= 0:
         logger.error(
-            f"Invalid end offset: \n"
-            f"({self.offset_answer_start}, {self.offset_answer_end}) with a span answer. "
+            "Invalid end offset: \n(%s, %s) with a span answer. ",
+            self.offset_answer_start,
+            self.offset_answer_end,
         )

 def _create_context_window(self, context_window_size: int, clear_text: str) -> Tuple[str, int, int]:
@@ -167,7 +170,8 @@ class QACandidate:
 """
 if self.offset_unit != "token":
     logger.error(
-        f"QACandidate needs to have self.offset_unit=token before calling _span_to_string() (id = {self.passage_id})"
+        "QACandidate needs to have self.offset_unit=token before calling _span_to_string() (id = %s)",
+        self.passage_id,
     )

 start_t = self.offset_answer_start
@@ -104,8 +104,10 @@ class Trainer:
 if use_amp in amp_mapping:
     logger.warning(
         "The Trainer only supports native PyTorch automatic mixed precision and no longer supports the Apex library.\n"
-        f"Because you provided Apex optimization level {use_amp}, automatic mixed precision was set to {amp_mapping[use_amp]}.\n"
-        "In the future, set `use_amp=True` to turn on automatic mixed precision."
+        "Because you provided Apex optimization level %s, automatic mixed precision was set to %s.\n"
+        "In the future, set `use_amp=True` to turn on automatic mixed precision.",
+        use_amp,
+        amp_mapping[use_amp],
     )
     use_amp = amp_mapping[use_amp]
 else:
|
|||||||
if ranks_with_data < torch.distributed.get_world_size():
|
if ranks_with_data < torch.distributed.get_world_size():
|
||||||
if step is not None:
|
if step is not None:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Stopping epoch {self.from_epoch} at step {step} for rank {self.local_rank} since at least one other rank "
|
"Stopping epoch %s at step %s for rank %s since at least one other rank "
|
||||||
f"(~ one GPU) in distributed training doesn't have any more batches... "
|
"(~ one GPU) in distributed training doesn't have any more batches... ",
|
||||||
|
self.from_epoch,
|
||||||
|
step,
|
||||||
|
self.local_rank,
|
||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
|
@@ -191,13 +191,15 @@ class OpenAIAnswerGenerator(BaseGenerator):

 if len(input_docs) == 0:
     logger.warning(
-        f"Skipping all of the provided Documents, as none of them fits the maximum token limit of "
-        f"{self.MAX_TOKENS_LIMIT}. The generated answers will therefore not be conditioned on any context."
+        "Skipping all of the provided Documents, as none of them fits the maximum token limit of %s"
+        "The generated answers will therefore not be conditioned on any context.",
+        self.MAX_TOKENS_LIMIT,
     )
 elif skipped_docs >= 1:
     logger.warning(
-        f"Skipping {skipped_docs} of the provided Documents, as using them would exceed the maximum token "
-        f"limit of {self.MAX_TOKENS_LIMIT}."
+        "Skipping %s of the provided Documents, as using them would exceed the maximum token limit of %s.",
+        skipped_docs,
+        self.MAX_TOKENS_LIMIT,
     )

 # Top ranked documents should go at the end
@@ -131,8 +131,9 @@ class RAGenerator(BaseGenerator):
 self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
 if len(self.devices) > 1:
     logger.warning(
-        f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-        f"using the first device {self.devices[0]}."
+        "Multiple devices are not supported in %s inference, using the first device %s.",
+        self.__class__.__name__,
+        self.devices[0],
     )

 self.tokenizer = RagTokenizer.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
@@ -389,8 +390,9 @@ class Seq2SeqGenerator(BaseGenerator):
 self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
 if len(self.devices) > 1:
     logger.warning(
-        f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-        f"using the first device {self.devices[0]}."
+        "Multiple devices are not supported in %s inference, using the first device %s.",
+        self.__class__.__name__,
+        self.devices[0],
     )

 Seq2SeqGenerator._register_converters(model_name_or_path, input_converter)
@@ -52,8 +52,9 @@ class TextToSpeech:
 resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
 if len(resolved_devices) > 1:
     logger.warning(
-        f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-        f"using the first device {resolved_devices[0]}."
+        "Multiple devices are not supported in %s inference, using the first device %s.",
+        self.__class__.__name__,
+        resolved_devices[0],
     )

 self.model = _Text2SpeechModel.from_pretrained(
@@ -311,7 +311,12 @@ class Crawler(BaseComponent):
     json.dump(document.to_dict(), f)
 except Exception as e:
     logging.exception(
-        f"Crawler can't save the content of '{link}' under '{file_path}'. This webpage will be skipped, but links from this page will still be crawled. Make sure the path above is accessible and the file name is valid. If the file name is invalid, consider setting 'crawler_naming_function' to another function."
+        "Crawler can't save the content of '%s' under '%s'. "
+        "This webpage will be skipped, but links from this page will still be crawled. "
+        "Make sure the path above is accessible and the file name is valid. "
+        "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
+        link,
+        file_path,
     )

 paths.append(file_path)
@@ -123,15 +123,17 @@ class TransformersDocumentClassifier(BaseDocumentClassifier):

 if labels and task == "text-classification":
     logger.warning(
-        f"Provided labels {labels} will be ignored for task text-classification. Set task to "
-        f"zero-shot-classification to use labels."
+        "Provided labels %s will be ignored for task text-classification. Set task to "
+        "zero-shot-classification to use labels.",
+        labels,
     )

 resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
 if len(resolved_devices) > 1:
     logger.warning(
-        f"Multiple devices are not supported in {self.__class__.__name__} inference, "
-        f"using the first device {resolved_devices[0]}."
+        "Multiple devices are not supported in %s inference, using the first device %s.",
+        self.__class__.__name__,
+        resolved_devices[0],
     )

 if tokenizer is None:
@@ -75,16 +75,18 @@ class EvalDocuments(BaseComponent):
     self.top_k_used = top_k
 elif self.top_k_used != top_k:
     logger.warning(
-        f"EvalDocuments was last run with top_k_eval_documents={self.top_k_used} but is "
-        f"being run again with top_k={self.top_k}. "
-        f"The evaluation counter is being reset from this point so that the evaluation "
-        f"metrics are interpretable."
+        "EvalDocuments was last run with top_k_eval_documents=%s} but is "
+        "being run again with top_k=%s. "
+        "The evaluation counter is being reset from this point so that the evaluation "
+        "metrics are interpretable.",
+        self.top_k_used,
+        self.top_k,
     )
     self.init_counts()

 if len(documents) < top_k and not self.too_few_docs_warning:
     logger.warning(
-        f"EvalDocuments is being provided less candidate documents than top_k " f"(currently set to {top_k})."
+        "EvalDocuments is being provided less candidate documents than top_k (currently set to %s).", top_k
     )
     self.too_few_docs_warning = True

@ -123,8 +123,9 @@ class EntityExtractor(BaseComponent):
|
|||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.progress_bar = progress_bar
|
self.progress_bar = progress_bar
|
||||||
|
@ -63,8 +63,9 @@ class FileTypeClassifier(BaseComponent):
|
|||||||
return mimetypes.guess_extension(extension) or ""
|
return mimetypes.guess_extension(extension) or ""
|
||||||
except NameError as ne:
|
except NameError as ne:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"The type of '{file_path}' could not be guessed, probably because 'python-magic' is not installed. Ignoring this error."
|
"The type of '%s' could not be guessed, probably because 'python-magic' is not installed. Ignoring this error."
|
||||||
"Please make sure the necessary OS libraries are installed if you need this functionality ('python-magic' or 'python-magic-bin' on Windows)."
|
"Please make sure the necessary OS libraries are installed if you need this functionality ('python-magic' or 'python-magic-bin' on Windows).",
|
||||||
|
file_path,
|
||||||
)
|
)
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
@ -201,8 +201,10 @@ class AzureConverter(BaseConverter):
|
|||||||
file_text += f" {cell}"
|
file_text += f" {cell}"
|
||||||
if not self.validate_language(file_text, valid_languages):
|
if not self.validate_language(file_text, valid_languages):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
|
"The language for %s is not one of %s. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
"been decoded in the correct text format.",
|
||||||
|
file_path,
|
||||||
|
valid_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
return docs
|
return docs
|
||||||
|
@ -146,8 +146,9 @@ class ImageToTextConverter(BaseConverter):
|
|||||||
document_text = "".join(cleaned_pages)
|
document_text = "".join(cleaned_pages)
|
||||||
if not self.validate_language(document_text, valid_languages):
|
if not self.validate_language(document_text, valid_languages):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for image is not one of {valid_languages}. The file may not have "
|
"The language for image is not one of %s. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
"been decoded in the correct text format.",
|
||||||
|
valid_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
text = "\f".join(cleaned_pages)
|
text = "\f".join(cleaned_pages)
|
||||||
|
@ -200,8 +200,10 @@ class ParsrConverter(BaseConverter):
|
|||||||
file_text += f" {cell}"
|
file_text += f" {cell}"
|
||||||
if not self.validate_language(file_text, valid_languages):
|
if not self.validate_language(file_text, valid_languages):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
|
"The language for %s is not one of %s. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
"been decoded in the correct text format.",
|
||||||
|
file_path,
|
||||||
|
valid_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
if extract_headlines:
|
if extract_headlines:
|
||||||
|
@ -150,8 +150,10 @@ class PDFToTextConverter(BaseConverter):
|
|||||||
document_text = "".join(cleaned_pages)
|
document_text = "".join(cleaned_pages)
|
||||||
if not self.validate_language(document_text, valid_languages):
|
if not self.validate_language(document_text, valid_languages):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
|
"The language for %s is not one of %s. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
"been decoded in the correct text format.",
|
||||||
|
file_path,
|
||||||
|
valid_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
text = "\f".join(cleaned_pages)
|
text = "\f".join(cleaned_pages)
|
||||||
|
@ -172,8 +172,10 @@ class TikaConverter(BaseConverter):
|
|||||||
document_text = "".join(cleaned_pages)
|
document_text = "".join(cleaned_pages)
|
||||||
if not self.validate_language(document_text, valid_languages):
|
if not self.validate_language(document_text, valid_languages):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
|
"The language for %s is not one of %s. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
"been decoded in the correct text format.",
|
||||||
|
file_path,
|
||||||
|
valid_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
text = "\f".join(cleaned_pages)
|
text = "\f".join(cleaned_pages)
|
||||||
|
@ -75,8 +75,10 @@ class TextConverter(BaseConverter):
|
|||||||
document_text = "".join(cleaned_pages)
|
document_text = "".join(cleaned_pages)
|
||||||
if not self.validate_language(document_text, valid_languages):
|
if not self.validate_language(document_text, valid_languages):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
|
"The language for %s is not one of %s. The file may not have "
|
||||||
f"been decoded in the correct text format."
|
"been decoded in the correct text format.",
|
||||||
|
file_path,
|
||||||
|
valid_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
text = "".join(cleaned_pages)
|
text = "".join(cleaned_pages)
|
||||||
|
@ -119,8 +119,9 @@ class PseudoLabelGenerator(BaseComponent):
|
|||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
self.retriever = retriever
|
self.retriever = retriever
|
||||||
|
@ -106,7 +106,7 @@ class PreProcessor(BasePreProcessor):
|
|||||||
try:
|
try:
|
||||||
nltk.download("punkt")
|
nltk.download("punkt")
|
||||||
except FileExistsError as error:
|
except FileExistsError as error:
|
||||||
logger.debug(f"NLTK punkt tokenizer seems to be already downloaded. Error message: {error}")
|
logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
|
||||||
pass
|
pass
|
||||||
self.clean_whitespace = clean_whitespace
|
self.clean_whitespace = clean_whitespace
|
||||||
self.clean_header_footer = clean_header_footer
|
self.clean_header_footer = clean_header_footer
|
||||||
@ -747,14 +747,16 @@ class PreProcessor(BasePreProcessor):
|
|||||||
# NLTK failed to load custom SentenceTokenizer, fallback to the default model or to English
|
# NLTK failed to load custom SentenceTokenizer, fallback to the default model or to English
|
||||||
if language_name is not None:
|
if language_name is not None:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. "
|
"PreProcessor couldn't find custom sentence tokenizer model for %s. Using default %s model.",
|
||||||
f"Using default {self.language} model."
|
self.language,
|
||||||
|
self.language,
|
||||||
)
|
)
|
||||||
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
|
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
|
||||||
else:
|
else:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. "
|
"PreProcessor couldn't find default or custom sentence tokenizer model for %s. "
|
||||||
f"Using English instead."
|
"Using English instead.",
|
||||||
|
self.language,
|
||||||
)
|
)
|
||||||
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
|
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
|
||||||
|
|
||||||
@ -763,8 +765,9 @@ class PreProcessor(BasePreProcessor):
|
|||||||
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
|
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
|
||||||
else:
|
else:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"PreProcessor couldn't find the default sentence tokenizer model for {self.language}. "
|
"PreProcessor couldn't find the default sentence tokenizer model for %s. "
|
||||||
f" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter."
|
" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
|
||||||
|
self.language,
|
||||||
)
|
)
|
||||||
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
|
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
|
||||||
|
|
||||||
|
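A related cleanup visible in the PreProcessor hunks above: pylint's separate f-string-without-interpolation check flags literals that carry an f prefix but no placeholder, like the english.pickle loads on the left-hand side. Dropping the prefix is behavior-preserving:

    # The literal contains no {...} expression, so the f prefix does nothing:
    sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")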
@ -125,8 +125,11 @@ class PromptTemplate(BasePromptTemplate, ABC):
|
|||||||
if args:
|
if args:
|
||||||
if len(args) != len(self.prompt_params):
|
if len(args) != len(self.prompt_params):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"For {self.name}, expected {self.prompt_params} arguments, instead "
|
"For %s, expected %s arguments, instead got %s arguments %s",
|
||||||
f"got {len(args)} arguments {args}"
|
self.name,
|
||||||
|
self.prompt_params,
|
||||||
|
len(args),
|
||||||
|
args,
|
||||||
)
|
)
|
||||||
for prompt_param, arg in zip(self.prompt_params, args):
|
for prompt_param, arg in zip(self.prompt_params, args):
|
||||||
template_dict[prompt_param] = [arg] if isinstance(arg, str) else arg
|
template_dict[prompt_param] = [arg] if isinstance(arg, str) else arg
|
||||||
@ -229,8 +232,9 @@ class HFLocalInvocationLayer(PromptModelInvocationLayer):
|
|||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
# Due to reflective construction of all invocation layers we might receive some
|
# Due to reflective construction of all invocation layers we might receive some
|
||||||
|
@ -100,8 +100,9 @@ class TransformersQueryClassifier(BaseQueryClassifier):
|
|||||||
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(resolved_devices) > 1:
|
if len(resolved_devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {resolved_devices[0]}."
|
self.__class__.__name__,
|
||||||
|
resolved_devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
self.model = pipeline(
|
self.model = pipeline(
|
||||||
|
@ -81,8 +81,9 @@ class QuestionGenerator(BaseComponent):
|
|||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
self.model = AutoModelForSeq2SeqLM.from_pretrained(
|
self.model = AutoModelForSeq2SeqLM.from_pretrained(
|
||||||
model_name_or_path, revision=model_version, use_auth_token=use_auth_token
|
model_name_or_path, revision=model_version, use_auth_token=use_auth_token
|
||||||
|
@ -1023,9 +1023,10 @@ class FARMReader(BaseReader):
|
|||||||
|
|
||||||
if self.top_k_per_candidate != 4:
|
if self.top_k_per_candidate != 4:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n"
|
"Performing Evaluation using top_k_per_candidate = %s \n"
|
||||||
f"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
|
"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
|
||||||
f"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5"
|
"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5",
|
||||||
|
self.top_k_per_candidate,
|
||||||
|
self.top_k_per_candidate + 1,
|
||||||
)
|
)
|
||||||
|
|
||||||
# extract all questions for evaluation
|
# extract all questions for evaluation
|
||||||
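One pitfall of this conversion appears in the hunk above: when the f prefix is removed, any {...} expression left inside the literal stops being interpolated and is logged verbatim. Every placeholder has to become %s with a matching trailing argument. A minimal sketch, assuming a configured logger:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    top_k_per_candidate = 4  # hypothetical value for illustration

    # Wrong: the braces are printed literally, nothing is interpolated.
    logger.info("n_best = {top_k_per_candidate + 1}")

    # Right: one %s per value, values passed as arguments.
    logger.info("n_best = %s", top_k_per_candidate + 1)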
@ -1062,7 +1063,7 @@ class FARMReader(BaseReader):
|
|||||||
continue
|
continue
|
||||||
if label.answer.offsets_in_document is None:
|
if label.answer.offsets_in_document is None:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Label.answer.offsets_in_document was None, but Span object was expected: {label} "
|
"Label.answer.offsets_in_document was None, but Span object was expected: %s ", label
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
# add to existing answers
|
# add to existing answers
|
||||||
@ -1074,7 +1075,11 @@ class FARMReader(BaseReader):
|
|||||||
# Hack to fix problem where duplicate questions are merged by doc_store processing creating a QA example with 8 annotations > 6 annotation max
|
# Hack to fix problem where duplicate questions are merged by doc_store processing creating a QA example with 8 annotations > 6 annotation max
|
||||||
if len(aggregated_per_question[aggregation_key]["answers"]) >= 6:
|
if len(aggregated_per_question[aggregation_key]["answers"]) >= 6:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Answers in this sample are being dropped because it has more than 6 answers. (doc_id: {doc_id}, question: {label.query}, label_id: {label.id})"
|
"Answers in this sample are being dropped because it has more than 6 answers. "
|
||||||
|
"(doc_id: %s, question: %s, label_id: %s)",
|
||||||
|
doc_id,
|
||||||
|
label.query,
|
||||||
|
label.id,
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
aggregated_per_question[aggregation_key]["answers"].append(
|
aggregated_per_question[aggregation_key]["answers"].append(
|
||||||
|
@ -116,8 +116,9 @@ class TableReader(BaseReader):
|
|||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
config = TapasConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
|
config = TapasConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
|
||||||
@ -646,8 +647,9 @@ class RCIReader(BaseReader):
|
|||||||
self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
self.row_model = AutoModelForSequenceClassification.from_pretrained(
|
self.row_model = AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
@ -86,8 +86,9 @@ class TransformersReader(BaseReader):
|
|||||||
|
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
self.model = pipeline(
|
self.model = pipeline(
|
||||||
|
@ -104,14 +104,18 @@ class _BaseEmbeddingEncoder:
|
|||||||
|
|
||||||
if model_similarity is not None and document_store.similarity != model_similarity:
|
if model_similarity is not None and document_store.similarity != model_similarity:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"You seem to be using {model_name} model with the {document_store.similarity} function instead of the recommended {model_similarity}. "
|
"You seem to be using %s model with the %s function instead of the recommended %s. "
|
||||||
f"This can be set when initializing the DocumentStore"
|
"This can be set when initializing the DocumentStore",
|
||||||
|
model_name,
|
||||||
|
document_store.similarity,
|
||||||
|
model_similarity,
|
||||||
)
|
)
|
||||||
elif "dpr" in model_name.lower() and document_store.similarity != "dot_product":
|
elif "dpr" in model_name.lower() and document_store.similarity != "dot_product":
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"You seem to be using a DPR model with the {document_store.similarity} function. "
|
"You seem to be using a DPR model with the %s function. "
|
||||||
f"We recommend using dot_product instead. "
|
"We recommend using dot_product instead. "
|
||||||
f"This can be set when initializing the DocumentStore"
|
"This can be set when initializing the DocumentStore",
|
||||||
|
document_store.similarity,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -245,9 +245,8 @@ class BaseRetriever(BaseComponent):
|
|||||||
mean_avg_precision = summed_avg_precision / number_of_questions
|
mean_avg_precision = summed_avg_precision / number_of_questions
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
(
|
"For {} out of {} questions ({:.2%}), the answer was in the top-{} candidate passages selected by the retriever.".format(
|
||||||
f"For {correct_retrievals} out of {number_of_questions} questions ({recall:.2%}), the answer was in"
|
correct_retrievals, number_of_questions, recall, top_k
|
||||||
f" the top-{top_k} candidate passages selected by the retriever."
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
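The retriever-evaluation hunk above is the one place where str.format survives, presumably because the {:.2%} spec has no one-to-one %-style twin. A lazy equivalent is still possible by pre-scaling the ratio and using %.2f%%; a hedged sketch with made-up values:

    import logging

    logger = logging.getLogger(__name__)
    correct_retrievals, number_of_questions, top_k = 42, 50, 10  # hypothetical values
    recall = correct_retrievals / number_of_questions

    logger.info(
        "For %s out of %s questions (%.2f%%), the answer was in the top-%s candidate passages.",
        correct_retrievals,
        number_of_questions,
        recall * 100,
        top_k,
    )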
@ -178,9 +178,10 @@ class DensePassageRetriever(DenseRetriever):
|
|||||||
|
|
||||||
if document_store and document_store.similarity != "dot_product":
|
if document_store and document_store.similarity != "dot_product":
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
|
"You are using a Dense Passage Retriever model with the %s function. "
|
||||||
"We recommend you use dot_product instead. "
|
"We recommend you use dot_product instead. "
|
||||||
"This can be set when initializing the DocumentStore"
|
"This can be set when initializing the DocumentStore",
|
||||||
|
document_store.similarity,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Init & Load Encoders
|
# Init & Load Encoders
|
||||||
@ -550,8 +551,9 @@ class DensePassageRetriever(DenseRetriever):
|
|||||||
"""
|
"""
|
||||||
if self.processor.num_hard_negatives != 0:
|
if self.processor.num_hard_negatives != 0:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"'num_hard_negatives' is set to {self.processor.num_hard_negatives}, but inference does "
|
"'num_hard_negatives' is set to %s, but inference does "
|
||||||
f"not require any hard negatives. Setting num_hard_negatives to 0."
|
"not require any hard negatives. Setting num_hard_negatives to 0.",
|
||||||
|
self.processor.num_hard_negatives,
|
||||||
)
|
)
|
||||||
self.processor.num_hard_negatives = 0
|
self.processor.num_hard_negatives = 0
|
||||||
|
|
||||||
@ -1163,8 +1165,9 @@ class TableTextRetriever(DenseRetriever):
|
|||||||
|
|
||||||
if self.processor.num_hard_negatives != 0:
|
if self.processor.num_hard_negatives != 0:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"'num_hard_negatives' is set to {self.processor.num_hard_negatives}, but inference does "
|
"'num_hard_negatives' is set to %s, but inference does "
|
||||||
f"not require any hard negatives. Setting num_hard_negatives to 0."
|
"not require any hard negatives. Setting num_hard_negatives to 0.",
|
||||||
|
self.processor.num_hard_negatives,
|
||||||
)
|
)
|
||||||
self.processor.num_hard_negatives = 0
|
self.processor.num_hard_negatives = 0
|
||||||
|
|
||||||
@ -1532,10 +1535,11 @@ class EmbeddingRetriever(DenseRetriever):
|
|||||||
and model_format != "sentence_transformers"
|
and model_format != "sentence_transformers"
|
||||||
):
|
):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"You seem to be using a Sentence Transformer embedding model but 'model_format' is set to '{self.model_format}'."
|
"You seem to be using a Sentence Transformer embedding model but 'model_format' is set to '%s'."
|
||||||
f" You may need to set model_format='sentence_transformers' to ensure correct loading of model."
|
" You may need to set model_format='sentence_transformers' to ensure correct loading of model."
|
||||||
f"As an alternative, you can let Haystack derive the format automatically by not setting the "
|
"As an alternative, you can let Haystack derive the format automatically by not setting the "
|
||||||
f"'model_format' parameter at all."
|
"'model_format' parameter at all.",
|
||||||
|
self.model_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.embedding_encoder = _EMBEDDING_ENCODERS[self.model_format](retriever=self)
|
self.embedding_encoder = _EMBEDDING_ENCODERS[self.model_format](retriever=self)
|
||||||
|
@ -96,8 +96,9 @@ class TransformersSummarizer(BaseSummarizer):
|
|||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s} inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
if tokenizer is None:
|
if tokenizer is None:
|
||||||
|
@ -83,8 +83,9 @@ class TransformersTranslator(BaseTranslator):
|
|||||||
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
|
||||||
if len(self.devices) > 1:
|
if len(self.devices) > 1:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Multiple devices are not supported in {self.__class__.__name__} inference, "
|
"Multiple devices are not supported in %s inference, using the first device %s.",
|
||||||
f"using the first device {self.devices[0]}."
|
self.__class__.__name__,
|
||||||
|
self.devices[0],
|
||||||
)
|
)
|
||||||
|
|
||||||
self.max_seq_len = max_seq_len
|
self.max_seq_len = max_seq_len
|
||||||
|
@ -293,10 +293,13 @@ class Pipeline:
|
|||||||
for document_store in document_stores:
|
for document_store in document_stores:
|
||||||
if document_store["type"] != "DeepsetCloudDocumentStore":
|
if document_store["type"] != "DeepsetCloudDocumentStore":
|
||||||
logger.info(
|
logger.info(
|
||||||
f"In order to be used on Deepset Cloud, component '{document_store['name']}' of type '{document_store['type']}' "
|
"In order to be used on Deepset Cloud, component '%s' of type '%s' "
|
||||||
f"has been automatically converted to type DeepsetCloudDocumentStore. "
|
"has been automatically converted to type DeepsetCloudDocumentStore. "
|
||||||
f"Usually this replacement will result in equivalent pipeline quality. "
|
"Usually this replacement will result in equivalent pipeline quality. "
|
||||||
f"However depending on chosen settings of '{document_store['name']}' differences might occur."
|
"However depending on chosen settings of '%s' differences might occur.",
|
||||||
|
document_store["name"],
|
||||||
|
document_store["type"],
|
||||||
|
document_store["name"],
|
||||||
)
|
)
|
||||||
document_store["type"] = "DeepsetCloudDocumentStore"
|
document_store["type"] = "DeepsetCloudDocumentStore"
|
||||||
document_store["params"] = {}
|
document_store["params"] = {}
|
||||||
@ -784,7 +787,7 @@ class Pipeline:
|
|||||||
|
|
||||||
# crop dataset if `dataset_size` is provided and is valid
|
# crop dataset if `dataset_size` is provided and is valid
|
||||||
if num_documents is not None and 0 < num_documents < len(corpus):
|
if num_documents is not None and 0 < num_documents < len(corpus):
|
||||||
logger.info(f"Cropping dataset from {len(corpus)} to {num_documents} documents")
|
logger.info("Cropping dataset from %s to %s documents", len(corpus), num_documents)
|
||||||
corpus = dict(itertools.islice(corpus.items(), num_documents))
|
corpus = dict(itertools.islice(corpus.items(), num_documents))
|
||||||
# Remove queries that don't contain the remaining documents
|
# Remove queries that don't contain the remaining documents
|
||||||
corpus_ids = set(list(corpus.keys()))
|
corpus_ids = set(list(corpus.keys()))
|
||||||
@ -800,8 +803,9 @@ class Pipeline:
|
|||||||
qrels = qrels_new
|
qrels = qrels_new
|
||||||
elif num_documents is not None and (num_documents < 1 or num_documents > len(corpus)):
|
elif num_documents is not None and (num_documents < 1 or num_documents > len(corpus)):
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"'num_documents' variable should be lower than corpus length and have a positive value, but it's {num_documents}."
|
"'num_documents' variable should be lower than corpus length and have a positive value, but it's %s."
|
||||||
" Dataset size remains unchanged."
|
" Dataset size remains unchanged.",
|
||||||
|
num_documents,
|
||||||
)
|
)
|
||||||
|
|
||||||
# check index before eval
|
# check index before eval
|
||||||
|
@ -82,7 +82,11 @@ def get_component_definitions(
|
|||||||
param_name = key.replace(env_prefix, "").lower()
|
param_name = key.replace(env_prefix, "").lower()
|
||||||
component_definition["params"][param_name] = value
|
component_definition["params"][param_name] = value
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Param '{param_name}' of component '{name}' overwritten with environment variable '{key}' value '{value}'."
|
"Param '%s' of component '%s' overwritten with environment variable '%s' value '%s'.",
|
||||||
|
param_name,
|
||||||
|
name,
|
||||||
|
key,
|
||||||
|
value,
|
||||||
)
|
)
|
||||||
return component_definitions
|
return component_definitions
|
||||||
|
|
||||||
@ -291,11 +295,13 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
|
|||||||
ok_to_ignore_version = pipeline_version == "ignore" and "rc" in __version__
|
ok_to_ignore_version = pipeline_version == "ignore" and "rc" in __version__
|
||||||
if not ok_to_ignore_version:
|
if not ok_to_ignore_version:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"This pipeline is version '{pipeline_version}', but you're using Haystack {__version__}\n"
|
"This pipeline is version '%s', but you're using Haystack %s\n"
|
||||||
"This might cause bugs and unexpected behaviors."
|
"This might cause bugs and unexpected behaviors."
|
||||||
"Please check out the release notes (https://github.com/deepset-ai/haystack/releases/latest), "
|
"Please check out the release notes (https://github.com/deepset-ai/haystack/releases/latest), "
|
||||||
"the documentation (https://haystack.deepset.ai/components/pipelines#yaml-file-definitions) "
|
"the documentation (https://haystack.deepset.ai/components/pipelines#yaml-file-definitions) "
|
||||||
"and fix your configuration accordingly."
|
"and fix your configuration accordingly.",
|
||||||
|
pipeline_version,
|
||||||
|
__version__,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Load the json schema, and create one if it doesn't exist yet
|
# Load the json schema, and create one if it doesn't exist yet
|
||||||
@ -317,7 +323,8 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
|
|||||||
if validation.instance["type"] not in loaded_custom_nodes:
|
if validation.instance["type"] not in loaded_custom_nodes:
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Missing definition for node of type {validation.instance['type']}. Looking into local classes..."
|
"Missing definition for node of type %s. Looking into local classes...",
|
||||||
|
validation.instance["type"],
|
||||||
)
|
)
|
||||||
missing_component_class = BaseComponent.get_subclass(validation.instance["type"])
|
missing_component_class = BaseComponent.get_subclass(validation.instance["type"])
|
||||||
schema = inject_definition_in_schema(node_class=missing_component_class, schema=schema)
|
schema = inject_definition_in_schema(node_class=missing_component_class, schema=schema)
|
||||||
|
@ -1092,8 +1092,10 @@ class EvaluationResult:
|
|||||||
query_answers = answers[answers["multilabel_id"] == multilabel_id]
|
query_answers = answers[answers["multilabel_id"] == multilabel_id]
|
||||||
if answer_metric not in metrics:
|
if answer_metric not in metrics:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"You specified an answer_metric={answer_metric} not available in calculated metrics={metrics.keys()}."
|
"You specified an answer_metric=%s not available in calculated metrics=%s."
|
||||||
f"Skipping collection of worst performing samples."
|
"Skipping collection of worst performing samples.",
|
||||||
|
answer_metric,
|
||||||
|
metrics.keys(),
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
if metrics[answer_metric] <= answer_metric_threshold:
|
if metrics[answer_metric] <= answer_metric_threshold:
|
||||||
@ -1127,8 +1129,10 @@ class EvaluationResult:
|
|||||||
for multilabel_id, metrics in worst_df.iterrows():
|
for multilabel_id, metrics in worst_df.iterrows():
|
||||||
if document_metric not in metrics:
|
if document_metric not in metrics:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"You specified a document_metric={document_metric} not available in calculated metrics={metrics.keys()}."
|
"You specified a document_metric=%s not available in calculated metrics=%s."
|
||||||
f"Skipping collection of worst performing samples."
|
"Skipping collection of worst performing samples.",
|
||||||
|
document_metric,
|
||||||
|
metrics.keys(),
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
if metrics[document_metric] <= document_metric_threshold:
|
if metrics[document_metric] <= document_metric_threshold:
|
||||||
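The two EvaluationResult hunks above also show why these multi-line messages need care: adjacent string literals are joined by Python's implicit concatenation with no separator, so every fragment except the last must end in a space or words fuse at the seam:

    # Implicit concatenation: the literals join with nothing in between.
    message = (
        "You specified an answer_metric=%s not available in calculated metrics=%s. "
        "Skipping collection of worst performing samples."
    )
    assert "%s. Skipping" in message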
@ -1185,9 +1189,9 @@ class EvaluationResult:
|
|||||||
document_relevance_criterion = answer_scope_to_doc_relevance_crit.get(answer_scope, document_scope)
|
document_relevance_criterion = answer_scope_to_doc_relevance_crit.get(answer_scope, document_scope)
|
||||||
elif answer_scope in answer_scope_to_doc_relevance_crit.keys():
|
elif answer_scope in answer_scope_to_doc_relevance_crit.keys():
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"You specified a non-answer document_scope together with a non-default answer_scope. "
|
"You specified a non-answer document_scope together with a non-default answer_scope. "
|
||||||
f"This may result in inconsistencies between answer and document metrics. "
|
"This may result in inconsistencies between answer and document metrics. "
|
||||||
f"To enforce the same definition of correctness for both, document_scope must be one of {['answer', 'document_id_or_answer']}."
|
"To enforce the same definition of correctness for both, document_scope must be one of 'answer', 'document_id_or_answer'."
|
||||||
)
|
)
|
||||||
|
|
||||||
return document_relevance_criterion # type: ignore[return-value]
|
return document_relevance_criterion # type: ignore[return-value]
|
||||||
|
@ -250,7 +250,11 @@ def _write_telemetry_config():
|
|||||||
# show a log message if telemetry config is written for the first time
|
# show a log message if telemetry config is written for the first time
|
||||||
if not CONFIG_PATH.is_file():
|
if not CONFIG_PATH.is_file():
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://docs.haystack.deepset.ai/docs/telemetry"
|
"Haystack sends anonymous usage data to understand the actual usage and steer dev efforts "
|
||||||
|
"towards features that are most meaningful to users. You can opt-out at anytime by calling "
|
||||||
|
"disable_telemetry() or by manually setting the environment variable "
|
||||||
|
"HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation "
|
||||||
|
"page. More information at https://docs.haystack.deepset.ai/docs/telemetry"
|
||||||
)
|
)
|
||||||
CONFIG_PATH.parents[0].mkdir(parents=True, exist_ok=True)
|
CONFIG_PATH.parents[0].mkdir(parents=True, exist_ok=True)
|
||||||
user_id = _get_or_create_user_id()
|
user_id = _get_or_create_user_id()
|
||||||
|
@ -420,7 +420,11 @@ class IndexClient:
|
|||||||
doc = response.json()
|
doc = response.json()
|
||||||
else:
|
else:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Document {id} could not be fetched from deepset Cloud: HTTP {response.status_code} - {response.reason}\n{response.content.decode()}"
|
"Document %s could not be fetched from deepset Cloud: HTTP %s - %s\n%s",
|
||||||
|
id,
|
||||||
|
response.status_code,
|
||||||
|
response.reason,
|
||||||
|
response.content.decode(),
|
||||||
)
|
)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
@ -625,7 +629,9 @@ class PipelineClient:
|
|||||||
else:
|
else:
|
||||||
logger.info("Pipeline config '%s' is already deployed.", pipeline_config_name)
|
logger.info("Pipeline config '%s' is already deployed.", pipeline_config_name)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Search endpoint for pipeline config '{pipeline_config_name}' is up and running for you under {pipeline_url}"
|
"Search endpoint for pipeline config '%s' is up and running for you under %s",
|
||||||
|
pipeline_config_name,
|
||||||
|
pipeline_url,
|
||||||
)
|
)
|
||||||
if show_curl_message:
|
if show_curl_message:
|
||||||
curl_cmd = (
|
curl_cmd = (
|
||||||
@ -925,7 +931,9 @@ class EvaluationSetClient:
|
|||||||
with open(file_path, "rb") as file:
|
with open(file_path, "rb") as file:
|
||||||
self.client.post(url=target_url, files={"file": (file_path.name, file, mime_type)})
|
self.client.post(url=target_url, files={"file": (file_path.name, file, mime_type)})
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully uploaded evaluation set file {file_path}. You can access it now under evaluation set '{file_path.name}'."
|
"Successfully uploaded evaluation set file %s. You can access it now under evaluation set '%s'.",
|
||||||
|
file_path,
|
||||||
|
file_path.name,
|
||||||
)
|
)
|
||||||
except DeepsetCloudError as e:
|
except DeepsetCloudError as e:
|
||||||
logger.error("Error uploading evaluation set file %s: %s", file_path, e.args)
|
logger.error("Error uploading evaluation set file %s: %s", file_path, e.args)
|
||||||
|
@ -87,8 +87,9 @@ def stop_container(container_name, delete_container=False):
|
|||||||
status = subprocess.run([f"docker stop {container_name}"], shell=True)
|
status = subprocess.run([f"docker stop {container_name}"], shell=True)
|
||||||
if status.returncode:
|
if status.returncode:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Tried to stop {container_name} but this failed. "
|
"Tried to stop %s but this failed. It is likely that there was no Docker container with the name %s",
|
||||||
f"It is likely that there was no Docker container with the name {container_name}"
|
container_name,
|
||||||
|
container_name,
|
||||||
)
|
)
|
||||||
if delete_container:
|
if delete_container:
|
||||||
status = subprocess.run([f"docker rm {container_name}"], shell=True)
|
status = subprocess.run([f"docker rm {container_name}"], shell=True)
|
||||||
|
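In the stop_container hunk above, container_name is passed twice because %-style placeholders are positional and each %s consumes one argument. When the same value recurs, logging also accepts a single mapping argument with named placeholders; a minimal sketch:

    import logging

    logger = logging.getLogger(__name__)
    container_name = "test_container"  # hypothetical name

    # One dict argument enables named, reusable placeholders:
    logger.warning(
        "Tried to stop %(name)s but this failed. "
        "It is likely that there was no Docker container with the name %(name)s",
        {"name": container_name},
    )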
@ -61,8 +61,8 @@ def print_answers(results: dict, details: str = "all", max_text_len: Optional[in
|
|||||||
filtered_answers = answers
|
filtered_answers = answers
|
||||||
else:
|
else:
|
||||||
valid_values = ", ".join(fields_to_keep_by_level.keys()) + " and 'all'"
|
valid_values = ", ".join(fields_to_keep_by_level.keys()) + " and 'all'"
|
||||||
logging.warn(f"print_answers received details='{details}', which was not understood. ")
|
logging.warn("print_answers received details='%s', which was not understood. ", details)
|
||||||
logging.warn(f"Valid values are {valid_values}. Using 'all'.")
|
logging.warn("Valid values are %s. Using 'all'.", valid_values)
|
||||||
filtered_answers = answers
|
filtered_answers = answers
|
||||||
|
|
||||||
# Shorten long text fields
|
# Shorten long text fields
|
||||||
|
@ -50,8 +50,11 @@ def retry_with_exponential_backoff(
|
|||||||
|
|
||||||
# Sleep for the delay
|
# Sleep for the delay
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"{e.__class__.__name__ } - {e}, "
|
"%s - %s, retry %s in %s seconds...",
|
||||||
f"retry {function.__name__} in {'{0:.2f}'.format(sleep_time)} seconds..."
|
e.__class__.__name__,
|
||||||
|
e,
|
||||||
|
function.__name__,
|
||||||
|
"{0:.2f}".format(sleep_time),
|
||||||
)
|
)
|
||||||
time.sleep(sleep_time)
|
time.sleep(sleep_time)
|
||||||
|
|
||||||
|
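In the backoff hunk above the delay is still pre-formatted with str.format before reaching the logger. %-style templates accept numeric format specs directly, so the pre-formatting can be dropped; a sketch with assumed names:

    import logging

    logger = logging.getLogger(__name__)
    sleep_time = 1.2345  # hypothetical delay in seconds

    # %.2f formats the float inside the template, and only when the record is emitted.
    logger.warning("ConnectionError - retry fetch_page in %.2f seconds...", sleep_time)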
@ -178,7 +178,7 @@ def create_dpr_training_dataset(squad_data: dict, retriever: BaseRetriever, num_
|
|||||||
|
|
||||||
if not hard_negative_ctxs or not positive_ctxs:
|
if not hard_negative_ctxs or not positive_ctxs:
|
||||||
logging.error(
|
logging.error(
|
||||||
f"No retrieved candidates for article {article_title}, with question {question['question']}"
|
"No retrieved candidates for article %s, with question %s", article_title, question["question"]
|
||||||
)
|
)
|
||||||
n_non_added_questions += 1
|
n_non_added_questions += 1
|
||||||
continue
|
continue
|
||||||
|
@ -260,7 +260,6 @@ disable = [
|
|||||||
"too-few-public-methods",
|
"too-few-public-methods",
|
||||||
"raise-missing-from",
|
"raise-missing-from",
|
||||||
"invalid-name",
|
"invalid-name",
|
||||||
"logging-fstring-interpolation",
|
|
||||||
"too-many-locals",
|
"too-many-locals",
|
||||||
"duplicate-code",
|
"duplicate-code",
|
||||||
"too-many-arguments",
|
"too-many-arguments",
|
||||||
|
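With the entry removed from the pyproject.toml disable list above, pylint enforces lazy interpolation across the codebase. To surface only this class of violation locally, an invocation along these lines should work (the path is an assumption):

    pylint --disable=all --enable=logging-fstring-interpolation haystack/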
@ -171,8 +171,9 @@ def export_feedback(
|
|||||||
context = squad_label["paragraphs"][0]["context"]
|
context = squad_label["paragraphs"][0]["context"]
|
||||||
if not context[start : start + len(answer)] == answer:
|
if not context[start : start + len(answer)] == answer:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Skipping invalid squad label as string via offsets "
|
"Skipping invalid squad label as string via offsets ('%s') does not match answer string ('%s') ",
|
||||||
f"('{context[start:start + len(answer)]}') does not match answer string ('{answer}') "
|
context[start : start + len(answer)],
|
||||||
|
answer,
|
||||||
)
|
)
|
||||||
export_data.append(squad_label)
|
export_data.append(squad_label)
|
||||||
|
|
||||||
|
@ -97,22 +97,27 @@ def _format_filters(filters):
|
|||||||
new_filters = {}
|
new_filters = {}
|
||||||
if filters is None:
|
if filters is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Request with deprecated filter format ('\"filters\": null'). "
|
"Request with deprecated filter format ('\"filters\": null'). "
|
||||||
f"Remove empty filters from params to be compliant with future versions"
|
"Remove empty filters from params to be compliant with future versions"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
for key, values in filters.items():
|
for key, values in filters.items():
|
||||||
if values is None:
|
if values is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Request with deprecated filter format ('{key}: null'). "
|
"Request with deprecated filter format ('%s: null'). "
|
||||||
f"Remove null values from filters to be compliant with future versions"
|
"Remove null values from filters to be compliant with future versions",
|
||||||
|
key,
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not isinstance(values, list):
|
if not isinstance(values, list):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Request with deprecated filter format ('{key}': {values}). "
|
"Request with deprecated filter format ('%s': %s). "
|
||||||
f"Change to '{key}':[{values}]' to be compliant with future versions"
|
"Change to '%s':[%s]' to be compliant with future versions",
|
||||||
|
key,
|
||||||
|
values,
|
||||||
|
key,
|
||||||
|
values,
|
||||||
)
|
)
|
||||||
values = [values]
|
values = [values]
|
||||||
|
|
||||||
|