chore: remove f-strings from logs for performance reasons (#3212)

* Use the %s syntax on all debug messages

* Use the %s syntax on some more debug messages

* Use the %s syntax on info messages

* Use the %s syntax on warning messages

* Use the %s syntax on error and exception messages

* mypy

* pylint

* trigger tutorials execution in CI

* trigger tutorials execution on CI

* black

* remove embeddings from repr

* fix Document `__repr__`

* address feedback

* mypy
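
The change applied across all 60 files is the same mechanical pattern: replace eagerly formatted f-string log messages with %s placeholders plus arguments, so the logging module only builds the final string when a record is actually emitted. A minimal before/after sketch of the idea (the logger setup and the `document_count` value are illustrative, not taken from the diff):

import logging

logging.basicConfig(level=logging.WARNING)  # INFO records are filtered out below
logger = logging.getLogger(__name__)

document_count = 1_000_000

# Before: the f-string is rendered eagerly, even though this INFO record is dropped.
logger.info(f"Updating embeddings for {document_count} docs...")

# After: the %s formatting is deferred until a handler actually emits the record,
# so suppressed messages cost almost nothing, and the constant template makes it
# easier for linters and log aggregators to group identical messages.
logger.info("Updating embeddings for %s docs...", document_count)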
Sara Zan 2022-09-19 18:18:32 +02:00 committed by GitHub
parent 8fbccbda82
commit dcb132ba59
60 changed files with 235 additions and 209 deletions
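
Two of the commits above ("remove embeddings from repr" and "fix Document `__repr__`") serve the same readability goal from another angle: a Document that gets logged should not dump its full embedding vector. A minimal sketch of the idea with a simplified class; the field handling and placeholder text are illustrative, not the committed implementation (`getattr` is used here so it also works when the embedding is a plain numpy array):

import numpy as np

class Document:
    def __init__(self, content: str, embedding=None):
        self.content = content
        self.embedding = embedding

    def to_dict(self):
        return {"content": self.content, "embedding": self.embedding}

    def __repr__(self):
        values = self.to_dict()
        # Swap the raw vector for a short summary so repr/log output stays readable.
        if values.get("embedding") is not None:
            values["embedding"] = f"<embedding of shape {getattr(values['embedding'], 'shape', '[no shape]')}>"
        return f"<Document: {values}>"

print(Document("hello world", embedding=np.zeros(768)))
# <Document: {'content': 'hello world', 'embedding': '<embedding of shape (768,)>'}>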


@ -29,7 +29,7 @@ def disable_and_log(func):
args_as_kwargs = args_to_kwargs(args, func)
parameters = {**args_as_kwargs, **kwargs}
logger.info(f"Input to {func.__name__}: {parameters}")
logger.info("Input to %s: %s", func.__name__, parameters)
return wrapper


@ -200,7 +200,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
"""
# Check if index_name refers to an alias
if self.client.indices.exists_alias(name=index_name):
logger.debug(f"Index name {index_name} is an alias.")
logger.debug("Index name %s is an alias.", index_name)
# check if the existing index has the embedding field; if not create it
if self.client.indices.exists(index=index_name, headers=headers):
@ -949,7 +949,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
all_terms_must_match=all_terms_must_match,
)
logger.debug(f"Retriever query: {body}")
logger.debug("Retriever query: %s", body)
result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"]
documents = [
@ -1086,7 +1086,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
body.append(headers)
body.append(cur_query_body)
logger.debug(f"Retriever query: {body}")
logger.debug("Retriever query: %s", body)
responses = self.client.msearch(index=index, body=body)
all_documents = []
@ -1287,7 +1287,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
if excluded_meta_data:
body["_source"] = {"excludes": excluded_meta_data}
logger.debug(f"Retriever query: {body}")
logger.debug("Retriever query: %s", body)
try:
result = self.client.search(index=index, body=body, request_timeout=300, headers=headers)["hits"]["hits"]
if len(result) == 0:
@ -1458,12 +1458,16 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
if update_existing_embeddings:
document_count = self.get_document_count(index=index, headers=headers)
logger.info(f"Updating embeddings for all {document_count} docs ...")
else:
document_count = self.get_document_count(
index=index, filters=filters, only_documents_without_embedding=True, headers=headers
)
logger.info(f"Updating embeddings for {document_count} docs without embeddings ...")
logger.info(
"Updating embeddings for all %s docs %s...",
document_count,
"without embeddings" if not update_existing_embeddings else "",
)
result = self._get_all_documents_in_index(
index=index,
@ -1674,7 +1678,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
def _delete_index(self, index: str):
if self.client.indices.exists(index):
self.client.indices.delete(index=index, ignore=[400, 404])
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
class ElasticsearchDocumentStore(BaseElasticsearchDocumentStore):


@ -346,7 +346,7 @@ class FAISSDocumentStore(SQLDocumentStore):
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return
logger.info(f"Updating embeddings for {document_count} docs...")
logger.info("Updating embeddings for %s docs...", document_count)
vector_id = sum(index.ntotal for index in self.faiss_indexes.values())
result = self._query(
@ -568,7 +568,7 @@ class FAISSDocumentStore(SQLDocumentStore):
)
if index in self.faiss_indexes:
del self.faiss_indexes[index]
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
super().delete_index(index)
def query_by_embedding(


@ -451,11 +451,10 @@ class InMemoryDocumentStore(BaseDocumentStore):
result = self._query(
index=index, filters=filters, only_documents_without_embedding=not update_existing_embeddings
)
document_count = len(result)
logger.info(f"Updating embeddings for {document_count} docs ...")
logger.info("Updating embeddings for %s docs ...", len(result) if logger.level > logging.DEBUG else 0)
batched_documents = get_batches_from_generator(result, batch_size)
with tqdm(
total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding"
total=len(result), disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding"
) as progress_bar:
for document_batch in batched_documents:
embeddings = retriever.embed_documents(document_batch) # type: ignore
@ -782,7 +781,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
"""
if index in self.indexes:
del self.indexes[index]
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
def delete_labels(
self,


@ -37,7 +37,7 @@ class InMemoryKnowledgeGraph(BaseKnowledgeGraph):
if index not in self.indexes:
self.indexes[index] = Graph()
else:
logger.warning(f"Index '{index}' is already present.")
logger.warning("Index '%s' is already present.", index)
def delete_index(self, index: Optional[str] = None):
"""
@ -49,7 +49,7 @@ class InMemoryKnowledgeGraph(BaseKnowledgeGraph):
if index in self.indexes:
del self.indexes[index]
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
def import_from_ttl_file(self, path: Path, index: Optional[str] = None):
"""


@ -313,13 +313,6 @@ class Milvus1DocumentStore(SQLDocumentStore):
index = index or self.index
self._create_collection_and_index_if_not_exist(index)
document_count = self.get_document_count(index=index)
if document_count == 0:
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return
logger.info(f"Updating embeddings for {document_count} docs...")
result = self._query(
index=index,
vector_ids=None,
@ -327,6 +320,14 @@ class Milvus1DocumentStore(SQLDocumentStore):
filters=filters,
only_documents_without_embedding=not update_existing_embeddings,
)
document_count = len(result)
if document_count == 0:
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return
logger.info("Updating embeddings for %s docs...", document_count)
batched_documents = get_batches_from_generator(result, batch_size)
with tqdm(
total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding"
@ -665,10 +666,10 @@ class Milvus1DocumentStore(SQLDocumentStore):
index = index or self.index
status, collection_info = self.milvus_server.get_collection_stats(collection_name=index)
if not status.OK():
logger.info(f"Failed fetch stats from store ...")
logger.info("Failed fetch stats from store ...")
return list()
logger.debug(f"collection_info = {collection_info}")
logger.debug("collection_info = %s", collection_info)
ids = list()
partition_list = collection_info["partitions"]
@ -679,16 +680,16 @@ class Milvus1DocumentStore(SQLDocumentStore):
status, id_list = self.milvus_server.list_id_in_segment(
collection_name=index, segment_name=segment_name
)
logger.debug(f"{status}: segment {segment_name} has {len(id_list)} vectors ...")
logger.debug("%s: segment %s has %s vectors ...", status, segment_name, len(id_list))
ids.extend(id_list)
if len(ids) == 0:
logger.info(f"No documents in the store ...")
logger.info("No documents in the store ...")
return list()
status, vectors = self.milvus_server.get_entity_by_id(collection_name=index, ids=ids)
if not status.OK():
logger.info(f"Failed fetch document for ids {ids} from store ...")
logger.info("Failed fetch document for ids %s from store ...", ids)
return list()
return vectors


@ -204,7 +204,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
for field in custom_fields:
if field.name == self.id_field or field.name == self.embedding_field:
logger.warning(f"Skipping `{field.name}` as it is similar to `id_field` or `embedding_field`")
logger.warning("Skipping '%s' as it is similar to 'id_field' or 'embedding_field'", field.name)
else:
fields.append(field)
@ -353,7 +353,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return
logger.info(f"Updating embeddings for {document_count} docs...")
logger.info("Updating embeddings for %s docs...", document_count)
result = self._query(
index=index,
@ -516,7 +516,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def _delete_index(self, index: str):
if utility.has_collection(collection_name=index):
utility.drop_collection(collection_name=index)
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
super().delete_index(index)
def get_all_documents_generator(


@ -379,7 +379,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore):
if excluded_meta_data:
body["_source"] = {"excludes": excluded_meta_data}
logger.debug(f"Retriever query: {body}")
logger.debug("Retriever query: %s", body)
result = self.client.search(index=index, body=body, request_timeout=300, headers=headers)["hits"]["hits"]
documents = [
@ -396,7 +396,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore):
"""
# Check if index_name refers to an alias
if self.client.indices.exists_alias(name=index_name):
logger.debug(f"Index name {index_name} is an alias.")
logger.debug("Index name %s is an alias.", index_name)
# check if the existing index has the embedding field; if not create it
if self.client.indices.exists(index=index_name, headers=headers):


@ -216,7 +216,7 @@ class PineconeDocumentStore(BaseDocumentStore):
stats = index_connection.describe_index_stats()
dims = stats["dimension"]
count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0
logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}")
logger.info("Index statistics: name: %s embedding dimensions: %s, record count: %s", index, dims, count)
# return index connection
return index_connection
@ -471,7 +471,7 @@ class PineconeDocumentStore(BaseDocumentStore):
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return
logger.info(f"Updating embeddings for {document_count} docs...")
logger.info("Updating embeddings for %s docs...", document_count)
# If the embedding namespace is empty or the user does not want to update existing embeddings, we use document namespace
if self.get_embedding_count(index=index) == 0 or not update_existing_embeddings:
@ -1003,7 +1003,7 @@ class PineconeDocumentStore(BaseDocumentStore):
index = self._index_name(index)
if index in pinecone.list_indexes():
pinecone.delete_index(index)
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
if index in self.pinecone_indexes:
del self.pinecone_indexes[index]
if index in self.all_ids:


@ -402,7 +402,7 @@ class SQLDocumentStore(BaseDocumentStore):
try:
meta_orms.append(MetaDocumentORM(name=key, value=value))
except TypeError as ex:
logger.error(f"Document {doc.id} - {ex}")
logger.error("Document %s - %s", doc.id, ex)
doc_mapping = {
"id": doc.id,
"content": doc.to_dict()["content"],
@ -425,7 +425,7 @@ class SQLDocumentStore(BaseDocumentStore):
try:
self.session.commit()
except Exception as ex:
logger.error(f"Transaction rollback: {ex.__cause__}")
logger.error("Transaction rollback: %s", ex.__cause__)
# Rollback is important here otherwise self.session will be in inconsistent state and next call will fail
self.session.rollback()
raise ex
@ -495,7 +495,7 @@ class SQLDocumentStore(BaseDocumentStore):
try:
self.session.commit()
except Exception as ex:
logger.error(f"Transaction rollback: {ex.__cause__}")
logger.error("Transaction rollback: %s", ex.__cause__)
self.session.rollback()
raise ex


@ -34,7 +34,7 @@ def eval_data_from_json(
with open(filename, "r", encoding="utf-8") as file:
data = json.load(file)
if "title" not in data["data"][0]:
logger.warning(f"No title information found for documents in QA file: {filename}")
logger.warning("No title information found for documents in QA file: %s", filename)
for squad_document in data["data"]:
if max_docs:


@ -313,7 +313,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
try:
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
logging.debug(f"Weaviate could not get the document requested: {usce}")
logging.debug("Weaviate could not get the document requested: %s", usce)
if result:
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
return document
@ -340,7 +340,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
try:
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
logging.debug(f"Weaviate could not get the document requested: {usce}")
logging.debug("Weaviate could not get the document requested: %s", usce)
if result:
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
documents.append(document)
@ -560,7 +560,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
and "error" in result["result"]["errors"]
):
for message in result["result"]["errors"]["error"]:
logger.error(f"{message['message']}")
logger.error(message["message"])
progress_bar.update(batch_size)
progress_bar.close()
@ -1215,7 +1215,10 @@ class WeaviateDocumentStore(BaseDocumentStore):
raise RuntimeError("Specify the arg `embedding_field` when initializing WeaviateDocumentStore()")
if update_existing_embeddings:
logger.info(f"Updating embeddings for all {self.get_document_count(index=index)} docs ...")
logger.info(
"Updating embeddings for all %s docs ...",
self.get_document_count(index=index) if logger.level > logging.DEBUG else 0,
)
else:
raise RuntimeError(
"All the documents in Weaviate store have an embedding by default. Only update is allowed!"
@ -1379,7 +1382,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
index = self._sanitize_index_name(index) or index
if any(c for c in self.weaviate_client.schema.get()["classes"] if c["class"] == index):
self.weaviate_client.schema.delete_class(index)
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
def delete_labels(self):
"""


@ -221,7 +221,7 @@ class DataSilo:
"""
Load serialized dataset from a cache.
"""
logger.info(f"Loading datasets from cache at {cache_dir}")
logger.info("Loading datasets from cache at %s", cache_dir)
self.data["train"] = torch.load(cache_dir / "train_dataset")
dev_dataset_path = cache_dir / "dev_dataset"
@ -277,7 +277,7 @@ class DataSilo:
torch.save(self.data["test"], cache_dir / "test_dataset")
torch.save(self.tensor_names, cache_dir / "tensor_names")
logger.info(f"Cached the datasets at {cache_dir}")
logger.info("Cached the datasets at %s", cache_dir)
def _initialize_data_loaders(self):
"""


@ -63,7 +63,7 @@ def convert_features_to_dataset(features):
)
except:
logger.debug(
f"Could not determine type for feature '{t_name}'. " "Converting now to a tensor of default type long."
"Could not determine type for feature '%s'. Converting now to a tensor of default type long.", t_name
)
# Convert all remaining python objects to torch long tensors


@ -147,7 +147,7 @@ class Processor(ABC):
sig = signature(cls.subclasses[processor_name])
unused_args = {k: v for k, v in kwargs.items() if k not in sig.parameters}
logger.debug(
f"Got more parameters than needed for loading {processor_name}: {unused_args}. " f"Those won't be used!"
"Got more parameters than needed for loading %s: %s. Those won't be used!", processor_name, unused_args
)
processor = cls.subclasses[processor_name](
data_dir=data_dir,
@ -324,7 +324,9 @@ class Processor(ABC):
if problematic_sample_ids:
n_problematic = len(problematic_sample_ids)
problematic_id_str = ", ".join([str(i) for i in problematic_sample_ids])
logger.error(f"Unable to convert {n_problematic} samples to features. Their ids are : {problematic_id_str}")
logger.error(
"Unable to convert %s samples to features. Their ids are : %s", n_problematic, problematic_id_str
)
@staticmethod
def _check_sample_features(basket: SampleBasket):
@ -348,7 +350,7 @@ class Processor(ABC):
return True
def _log_samples(self, n_samples: int, baskets: List[SampleBasket]):
logger.debug("*** Show {} random examples ***".format(n_samples))
logger.debug("*** Show %s random examples ***", n_samples)
if len(baskets) == 0:
logger.debug("*** No samples to show because there are no baskets ***")
return
@ -1817,7 +1819,7 @@ class TextClassificationProcessor(Processor):
self.header = header
self.max_samples = max_samples
self.dev_stratification = dev_stratification
logger.debug(f"Currently no support in Processor for returning problematic ids")
logger.debug("Currently no support in Processor for returning problematic ids")
super(TextClassificationProcessor, self).__init__(
tokenizer=tokenizer,
@ -2141,14 +2143,14 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None
dev_labels[q["id"]] = q["answers"][0]["text"]
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
if len(not_included) > 0:
logger.info(f"There were missing predicitons for question ids: {list(not_included)}")
logger.info("There were missing predicitons for question ids: %s", list(not_included))
for x in not_included:
predictions_json[x] = ""
# os.makedirs("model_output", exist_ok=True)
# filepath = Path("model_output") / out_filename
json.dump(predictions_json, open(out_filename, "w"))
logger.info(f"Written Squad predictions to: {out_filename}")
logger.info("Written Squad predictions to: %s", out_filename)
def _read_dpr_json(
@ -2188,7 +2190,7 @@ def _read_dpr_json(
"""
# get remote dataset if needed
if not os.path.exists(file):
logger.info(f" Couldn't find {file} locally. Trying to download ...")
logger.info("Couldn't find %s locally. Trying to download ...", file)
_download_extract_downstream_data(file, proxies=proxies)
if Path(file).suffix.lower() == ".jsonl":
@ -2250,7 +2252,7 @@ def _read_dpr_json(
def _read_squad_file(filename: str, proxies=None):
"""Read a SQuAD json file"""
if not os.path.exists(filename):
logger.info(f" Couldn't find {filename} locally. Trying to download ...")
logger.info("Couldn't find %s locally. Trying to download ...", filename)
_download_extract_downstream_data(filename, proxies)
with open(filename, "r", encoding="utf-8") as reader:
input_data = json.load(reader)["data"]


@ -112,10 +112,10 @@ class Evaluator:
for head_num, head in enumerate(model.prediction_heads):
if head.model_type == "span_classification" and calibrate_conf_scores:
temperature_previous = head.temperature_for_confidence.item()
logger.info(f"temperature used for confidence scores before calibration: {temperature_previous}")
logger.info("temperature used for confidence scores before calibration: %s", temperature_previous)
head.calibrate_conf(logits_all[head_num], label_all[head_num])
temperature_current = head.temperature_for_confidence.item()
logger.info(f"temperature used for confidence scores after calibration: {temperature_current}")
logger.info("temperature used for confidence scores after calibration: %s", temperature_current)
temperature_change = (abs(temperature_current - temperature_previous) / temperature_previous) * 100.0
if temperature_change > 50:
logger.warning(


@ -283,7 +283,7 @@ class Inferencer:
else:
num_processes = mp.cpu_count()
self.process_pool = mp.Pool(processes=num_processes)
logger.info(f"Got ya {num_processes} parallel workers to do inference ...")
logger.info("Got ya %s parallel workers to do inference ...", num_processes)
log_ascii_workers(n=num_processes, logger=logger)
def close_multiprocessing_pool(self, join: bool = False):


@ -142,7 +142,7 @@ class BaseAdaptiveModel:
"does not currently support saving and loading"
)
assert len(model_files) == len(config_files), error_str
logger.info(f"Found files for loading {len(model_files)} prediction heads")
logger.info("Found files for loading %s prediction heads", len(model_files))
return model_files, config_files
@ -573,7 +573,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
try:
tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")
logger.warning("ML logging didn't work: %s", e)
def verify_vocab_size(self, vocab_size: int):
"""


@ -376,7 +376,7 @@ class BiAdaptiveModel(nn.Module):
try:
tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")
logger.warning("ML logging didn't work: %s", e)
def verify_vocab_size(self, vocab_size1: int, vocab_size2: int):
"""


@ -488,7 +488,7 @@ class HFLanguageModelNoSegmentIds(HFLanguageModelWithPooler):
specified using the arguments `output_hidden_states` and `output_attentions`.
"""
if segment_ids is not None:
logging.warning(f"`segment_ids` is not None, but {self.name} does not use them. They will be ignored.")
logger.warning(f"'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)
return super().forward(
input_ids=input_ids,
@ -906,7 +906,7 @@ def get_language_model(
"Ensure that the model class name can be inferred from the directory name "
"when loading a Transformers model."
)
logger.error(f"Using the AutoModel class for '{pretrained_model_name_or_path}'. This can cause crashes!")
logger.error("Using the AutoModel class for '%s'. This can cause crashes!", pretrained_model_name_or_path)
model_type = "Auto"
# Find the class corresponding to this model type
@ -918,7 +918,9 @@ def get_language_model(
f"Supported model types are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}"
)
logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}' {'(' + model_type + ')' if model_type else ''}")
logger.info(
" * LOADING MODEL: '%s' %s", pretrained_model_name_or_path, "(" + model_type + ")" if model_type else ""
)
# Instantiate the class for this model
language_model = language_model_class(
@ -930,8 +932,10 @@ def get_language_model(
model_kwargs=model_kwargs,
)
logger.info(
f"Loaded '{pretrained_model_name_or_path}' ({model_type} model) "
f"from {'local file system' if config_file_exists else 'model hub'}."
"Loaded '%s' (%s model) from %s.",
pretrained_model_name_or_path,
model_type,
"local file system" if config_file_exists else "model hub",
)
return language_model
@ -963,7 +967,7 @@ def _get_model_type(
model_type = config.architectures[0] if is_supported_model(config.architectures[0]) else None
except Exception as e:
logger.error(f"AutoConfig failed to load on '{model_name_or_path}': {str(e)}")
logger.error("AutoConfig failed to load on '%s': %s", model_name_or_path, e)
if not model_type:
logger.warning("Could not infer the model type from its config. Looking for clues in the model name.")
@ -991,5 +995,5 @@ def _guess_language(name: str) -> str:
language = languages[0]
else:
language = "english"
logger.info(f"Auto-detected model language: {language}")
logger.info("Auto-detected model language: %s", language)
return language


@ -188,7 +188,7 @@ def _get_optim(model, opts: Dict):
optimizer_name = opts.pop("name", None)
# Logging
logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'")
logger.info("Loading optimizer '%s': %s", optimizer_name, opts)
tracker.track_params(opts)
tracker.track_params({"optimizer_name": optimizer_name})
@ -271,7 +271,7 @@ def get_scheduler(optimizer, opts):
except AttributeError:
raise AttributeError(f"Scheduler '{schedule_name}' not found in 'torch' or 'transformers'")
logger.info(f"Using scheduler '{schedule_name}'")
logger.info("Using scheduler '%s'", schedule_name)
# get supported args of constructor
allowed_args = inspect.signature(sched_constructor).parameters.keys()
@ -285,7 +285,7 @@ def get_scheduler(optimizer, opts):
constructor_opts = {k: v for k, v in opts.items() if k in allowed_args}
# Logging
logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'")
logger.info("Loading schedule '%s': '%s'", schedule_name, constructor_opts)
tracker.track_params(constructor_opts)
tracker.track_params({"schedule_name": schedule_name})


@ -265,7 +265,7 @@ class QuestionAnsweringHead(PredictionHead):
self.layer_dims = layer_dims
assert self.layer_dims[-1] == 2
self.feed_forward = FeedForwardBlock(self.layer_dims)
logger.debug(f"Prediction head initialized with size {self.layer_dims}")
logger.debug("Prediction head initialized with size %s", self.layer_dims)
self.num_labels = self.layer_dims[-1]
self.ph_output_type = "per_token_squad"
self.model_type = "span_classification" # predicts start and end token of answer


@ -409,7 +409,7 @@ class TriAdaptiveModel(nn.Module):
try:
tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")
logger.warning("ML logging didn't work: %s", e)
def verify_vocab_size(self, vocab_size1: int, vocab_size2: int, vocab_size3: int):
"""Verifies that the model fits to the tokenizer vocabulary.


@ -192,9 +192,9 @@ class Trainer:
# when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
if resume_from_step and step <= resume_from_step:
if step % 10000 == 0:
logger.info(f"Skipping {step} out of {resume_from_step} steps ...")
logger.info("Skipping %s out of %s steps ...", step, resume_from_step)
if resume_from_step == step:
logger.info(f"Finished skipping {resume_from_step} steps ...")
logger.info("Finished skipping %s steps ...", resume_from_step)
resume_from_step = None
else:
continue
@ -385,9 +385,9 @@ class Trainer:
trainer = cls._load_checkpoint(
path=checkpoint_to_load, data_silo=data_silo, model=model, optimizer=optimizer, local_rank=local_rank
)
logging.info(f"Resuming training from the train checkpoint at {checkpoint_to_load} ...")
logging.info("Resuming training from the train checkpoint at %s ...", checkpoint_to_load)
else:
logging.info(f"No train checkpoints found. Starting a new training ...")
logging.info("No train checkpoints found. Starting a new training ...")
trainer = cls(
data_silo=data_silo,
model=model,
@ -447,7 +447,7 @@ class Trainer:
data_silo=data_silo, model=model, optimizer=optimizer, lr_schedule=scheduler, **trainer_state_dict
)
logger.info(f"Loaded a train checkpoint from {path}")
logger.info("Loaded a train checkpoint from %s", path)
return trainer
@classmethod
@ -516,7 +516,7 @@ class Trainer:
for cp in saved_checkpoints[self.checkpoints_to_keep :]:
shutil.rmtree(cp)
logger.info(f"Saved a training checkpoint after {checkpoint_name}")
logger.info("Saved a training checkpoint after %s", checkpoint_name)
def _get_state_dict(self):
"""


@ -102,8 +102,9 @@ def initialize_device_settings(
device_to_replace = torch.device("cuda")
devices_to_use = [torch.device("cuda:0") if device == device_to_replace else device for device in devices_to_use]
logger.info(f"Using devices: {', '.join([str(device) for device in devices_to_use]).upper()}")
logger.info(f"Number of GPUs: {n_gpu}")
logger.info(
"Using devices: %s - Number of GPUs: %s", ", ".join([str(device) for device in devices_to_use]).upper(), n_gpu
)
return devices_to_use, n_gpu
@ -135,7 +136,7 @@ def try_get(keys, dictionary):
ret = ret[0]
return ret
except Exception as e:
logger.warning(f"Cannot extract from dict {dictionary} with error: {e}")
logger.warning("Cannot extract from dict %s with error: %s", dictionary, e)
return None


@ -176,7 +176,7 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
node_name = getattr(node_class, "__name__")
logger.info(f"Creating schema for '{node_name}'")
logger.info("Creating schema for '%s'", node_name)
# Read the relevant init parameters from __init__'s signature
init_method = getattr(node_class, "__init__", None)
@ -400,7 +400,7 @@ def inject_definition_in_schema(node_class: Type[BaseComponent], schema: Dict[st
schema_definition, node_ref = create_schema_for_node_class(node_class)
schema["definitions"].update(schema_definition)
schema["properties"]["components"]["items"]["anyOf"].append(node_ref)
logger.info(f"Added definition for {getattr(node_class, '__name__')}")
logger.info("Added definition for %s", getattr(node_class, "__name__"))
return schema


@ -124,7 +124,7 @@ class RAGenerator(BaseGenerator):
if top_k > self.num_beams:
top_k = self.num_beams
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {num_beams}")
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", num_beams)
self.top_k = top_k
@ -238,7 +238,7 @@ class RAGenerator(BaseGenerator):
if top_k > self.num_beams:
top_k = self.num_beams
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {top_k}")
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", top_k)
# Flatten the documents so easy to reference
flat_docs_dict = self._flatten_docs(documents)
@ -382,7 +382,7 @@ class Seq2SeqGenerator(BaseGenerator):
if top_k > self.num_beams:
top_k = self.num_beams
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {num_beams}")
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", num_beams)
self.top_k = top_k
@ -434,7 +434,7 @@ class Seq2SeqGenerator(BaseGenerator):
if top_k > self.num_beams:
top_k = self.num_beams
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {top_k}")
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", top_k)
converter: Callable = Seq2SeqGenerator._get_converter(self.model_name_or_path)
if not converter:


@ -216,9 +216,11 @@ class Crawler(BaseComponent):
file_paths: list = []
is_not_empty = len(list(output_dir.rglob("*"))) > 0
if is_not_empty and not overwrite_existing_files:
logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
logger.info(
"Found data stored in `%s`. Delete this first if you really want to fetch new data.", output_dir
)
else:
logger.info(f"Fetching from {urls} to `{output_dir}`")
logger.info("Fetching from %s to `%s`", urls, output_dir)
# Start by writing out the initial list of urls
if filter_urls:
@ -278,7 +280,7 @@ class Crawler(BaseComponent):
) -> List[Path]:
paths = []
for link in urls:
logger.info(f"writing contents from `{link}`")
logger.info("writing contents from '%s'", link)
self.driver.get(link)
if loading_wait_time is not None:
time.sleep(loading_wait_time)


@ -37,7 +37,7 @@ class BaseDocumentClassifier(BaseComponent):
results = []
document_ids = [doc.id for doc in results]
logger.debug(f"Classified documents with IDs: {document_ids}")
logger.debug("Classified documents with IDs: %s", document_ids)
# convert back to dicts if we are in an indexing pipeline
if root_node == "File":
@ -54,11 +54,11 @@ class BaseDocumentClassifier(BaseComponent):
if isinstance(documents[0], Document):
document_ids = [doc.id for doc in results]
logger.debug(f"Classified documents with IDs: {document_ids}")
logger.debug("Classified documents with IDs: %s", document_ids)
else:
for doc_list in results:
document_ids = [doc.id for doc in doc_list]
logger.debug(f"Classified documents with IDs: {document_ids}")
logger.debug("Classified documents with IDs: %s", document_ids)
return output, "output_1"


@ -135,7 +135,7 @@ class ImageToTextConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from file")
logger.debug("Removing line '%s' from file", line)
continue
cleaned_lines.append(line)


@ -140,7 +140,7 @@ class PDFToTextConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
logger.debug("Removing line '%s' from %s", line, file_path)
continue
cleaned_lines.append(line)
@ -256,7 +256,7 @@ class PDFToTextOCRConverter(BaseConverter):
image.save(temp_img.name)
pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
except Exception as exception:
logger.error(f"File {file_path} has an error \n {exception}")
logger.error("File %s has an error:\n%s", file_path, exception)
raw_text = "\f".join(pages)
document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)


@ -160,7 +160,7 @@ class TikaConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
logger.debug("Removing line '%s' from %s", line, file_path)
continue
cleaned_lines.append(line)


@ -63,7 +63,7 @@ class TextConverter(BaseConverter):
# remove lines having > 40% of words as digits AND not ending with a period(.)
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
logger.debug("Removing line '%s' from %s", line, file_path)
continue
cleaned_lines.append(line)


@ -465,7 +465,7 @@ class PreProcessor(BasePreProcessor):
found_footer = self._find_longest_common_ngram(end_of_pages)
if found_footer:
pages = [page.replace(found_footer, "") for page in pages]
logger.debug(f"Removed header '{found_header}' and footer '{found_footer}' in document")
logger.debug("Removed header '%s' and footer '%s' in document", found_header, found_footer)
text = "\f".join(pages)
return text
@ -537,10 +537,10 @@ class PreProcessor(BasePreProcessor):
sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle")
sentences = sentence_tokenizer.tokenize(text)
except LookupError:
logger.exception(f"PreProcessor couldn't load sentence tokenizer from {str(tokenizer_model_path)}")
logger.exception("PreProcessor couldn't load sentence tokenizer from %s", tokenizer_model_path)
except (UnpicklingError, ValueError) as e:
logger.exception(
f"PreProcessor couldn't determine model format of sentence tokenizer at {str(tokenizer_model_path)}."
"PreProcessor couldn't determine model format of sentence tokenizer at %s", tokenizer_model_path
)
if sentences:
return sentences


@ -41,7 +41,7 @@ class BaseRanker(BaseComponent):
results = []
document_ids = [doc.id for doc in results]
logger.debug(f"Retrieved documents with IDs: {document_ids}")
logger.debug("Retrieved documents with IDs: %s", document_ids)
output = {"documents": results}
return output, "output_1"
@ -59,7 +59,7 @@ class BaseRanker(BaseComponent):
for doc_list in results:
document_ids = [doc.id for doc in doc_list]
logger.debug(f"Ranked documents with IDs: {document_ids}")
logger.debug("Ranked documents with IDs: %s", document_ids)
output = {"documents": results}


@ -728,7 +728,7 @@ class FARMReader(BaseReader):
:param directory: Directory where the Reader model should be saved
"""
logger.info(f"Saving reader model to {directory}")
logger.info("Saving reader model to %s", directory)
self.inferencer.model.save(directory)
self.inferencer.processor.save(directory)
@ -1026,7 +1026,7 @@ class FARMReader(BaseReader):
aggregated_per_doc = defaultdict(list)
for label in labels:
if not label.document.id:
logger.error(f"Label does not contain a document id")
logger.error("Label does not contain a document id")
continue
aggregated_per_doc[label.document.id].append(label)
@ -1036,7 +1036,7 @@ class FARMReader(BaseReader):
for doc_id in all_doc_ids:
doc = document_store.get_document_by_id(doc_id, index=doc_index)
if not doc:
logger.error(f"Document with the ID '{doc_id}' is not present in the document store.")
logger.error("Document with the ID '%s' is not present in the document store.", doc_id)
continue
d[str(doc_id)] = {"context": doc.content}
# get all questions / answers
@ -1047,7 +1047,7 @@ class FARMReader(BaseReader):
for label in aggregated_per_doc[doc_id]:
aggregation_key = (doc_id, label.query)
if label.answer is None:
logger.error(f"Label.answer was None, but Answer object was expected: {label} ")
logger.error("Label.answer was None, but Answer object was expected: %s", label)
continue
if label.answer.offsets_in_document is None:
logger.error(


@ -177,13 +177,13 @@ class TableReader(BaseReader):
no_answer_score = 1.0
for document in documents:
if document.content_type != "table":
logger.warning(f"Skipping document with id '{document.id}' in TableReader as it is not of type table.")
logger.warning("Skipping document with id '%s' in TableReader as it is not of type table.", document.id)
continue
table: pd.DataFrame = document.content
if table.shape[0] == 0:
logger.warning(
f"Skipping document with id '{document.id}' in TableReader as it does not contain any rows."
"Skipping document with id '%s' in TableReader as it does not contain any rows.", document.id
)
continue
# Tokenize query and current table
@ -656,13 +656,13 @@ class RCIReader(BaseReader):
answers = []
for document in documents:
if document.content_type != "table":
logger.warning(f"Skipping document with id '{document.id}' in RCIReader as it is not of type table.")
logger.warning("Skipping document with id '%s' in RCIReader as it is not of type table.", document.id)
continue
table: pd.DataFrame = document.content
if table.shape[0] == 0:
logger.warning(
f"Skipping document with id '{document.id}' in RCIReader as it does not contain any rows."
"Skipping document with id '%s' in RCIReader as it does not contain any rows.", document.id
)
continue
table = table.astype(str)


@ -221,7 +221,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
else:
train_examples.append(InputExample(texts=texts))
logger.info(f"Training/adapting {self.embedding_model} with {len(train_examples)} examples")
logger.info("Training/adapting %s with %s examples", self.embedding_model, len(train_examples))
train_dataloader = DataLoader(train_examples, batch_size=batch_size, drop_last=True, shuffle=True)
train_loss = st_loss.loss(self.embedding_model)


@ -328,7 +328,7 @@ class BaseRetriever(BaseComponent):
query=query, filters=filters, top_k=top_k, index=index, headers=headers, scale_score=scale_score
)
document_ids = [doc.id for doc in documents]
logger.debug(f"Retrieved documents with IDs: {document_ids}")
logger.debug("Retrieved documents with IDs: %s", document_ids)
output = {"documents": documents}
return output, "output_1"
@ -351,13 +351,13 @@ class BaseRetriever(BaseComponent):
if not isinstance(doc, Document):
raise HaystackError(f"doc was of type {type(doc)}, but expected a Document.")
document_ids.append(doc.id)
logger.debug(f"Retrieved documents with IDs: {document_ids}")
logger.debug("Retrieved documents with IDs: %s", document_ids)
else:
for doc_list in documents:
if not isinstance(doc_list, list):
raise HaystackError(f"doc_list was of type {type(doc_list)}, but expected a list of Documents.")
document_ids = [doc.id for doc in doc_list]
logger.debug(f"Retrieved documents with IDs: {document_ids}")
logger.debug("Retrieved documents with IDs: %s", document_ids)
output = {"documents": documents}
return output, "output_1"


@ -752,7 +752,7 @@ class DensePassageRetriever(BaseRetriever):
use_fast_tokenizers=use_fast_tokenizers,
similarity_function=similarity_function,
)
logger.info(f"DPR model loaded from {load_dir}")
logger.info("DPR model loaded from %s", load_dir)
return dpr
@ -1435,7 +1435,7 @@ class TableTextRetriever(BaseRetriever):
use_fast_tokenizers=use_fast_tokenizers,
similarity_function=similarity_function,
)
logger.info(f"TableTextRetriever model loaded from {load_dir}")
logger.info("TableTextRetriever model loaded from %s", load_dir)
return mm_retriever
@ -1530,7 +1530,7 @@ class EmbeddingRetriever(BaseRetriever):
else model_format
)
logger.info(f"Init retriever using embeddings of model {embedding_model}")
logger.info("Init retriever using embeddings of model %s", embedding_model)
if self.model_format not in _EMBEDDING_ENCODERS.keys():
raise ValueError(f"Unknown retriever embedding model format {model_format}")


@ -438,7 +438,7 @@ class TfidfRetriever(BaseRetriever):
continue
paragraphs.append(Paragraph(document_id=doc.id, paragraph_id=p_id, content=(p,), meta=doc.meta))
p_id += 1
logger.info(f"Found {len(paragraphs)} candidate paragraphs from {len(documents)} docs in DB")
logger.info("Found %s candidate paragraphs from %s docs in DB", len(paragraphs), len(documents))
return paragraphs
def _calc_scores(self, queries: Union[str, List[str]]) -> List[Dict[int, float]]:
@ -514,7 +514,9 @@ class TfidfRetriever(BaseRetriever):
df_sliced = df_sliced[:top_k]
logger.debug(
f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
"Identified %s candidates via retriever:\n%s",
df_sliced.shape[0],
df_sliced.to_string(col_space=10, index=False),
)
# get actual content for the top candidates
@ -585,8 +587,9 @@ class TfidfRetriever(BaseRetriever):
df_sliced = self.df.loc[query_result.keys()]
df_sliced = df_sliced[:top_k]
logger.debug(
f"Identified {df_sliced.shape[0]} candidates via retriever:"
f"\n {df_sliced.to_string(col_space=10, index=False)}"
"Identified %s candidates via retriever:\n%s",
df_sliced.shape[0],
df_sliced.to_string(col_space=10, index=False),
)
# get actual content for the top candidates


@ -14,7 +14,6 @@ import json
import inspect
import logging
import tempfile
import traceback
from pathlib import Path
import yaml
@ -296,14 +295,14 @@ class Pipeline:
f"Deployed pipeline configs are not allowed to be updated. Please undeploy pipeline config '{pipeline_config_name}' first."
)
client.update_pipeline_config(config=config, pipeline_config_name=pipeline_config_name)
logger.info(f"Pipeline config '{pipeline_config_name}' successfully updated.")
logger.info("Pipeline config '%s' successfully updated.", pipeline_config_name)
else:
raise ValueError(
f"Pipeline config '{pipeline_config_name}' already exists. Set `overwrite=True` to overwrite pipeline config."
)
else:
client.save_pipeline_config(config=config, pipeline_config_name=pipeline_config_name)
logger.info(f"Pipeline config '{pipeline_config_name}' successfully created.")
logger.info("Pipeline config '%s' successfully created.", pipeline_config_name)
@classmethod
def deploy_on_deepset_cloud(
@ -508,13 +507,15 @@ class Pipeline:
predecessors = set(nx.ancestors(self.graph, node_id))
if predecessors.isdisjoint(set(queue.keys())): # only execute if predecessor nodes are executed
try:
logger.debug(f"Running node `{node_id}` with input `{node_input}`")
logger.debug("Running node '%s` with input: %s", node_id, node_input)
node_output, stream_id = self._run_node(node_id, node_input)
except Exception as e:
tb = traceback.format_exc()
# The input might be a really large object with thousands of embeddings.
# If you really want to see it, raise the log level.
logger.debug("Exception while running node '%s' with input %s", node_id, node_input)
raise Exception(
f"Exception while running node `{node_id}` with input `{node_input}`: {e}, full stack trace: {tb}"
)
f"Exception while running node '{node_id}': {e}\nEnable debug logging to see the data that was passed when the pipeline failed."
) from e
queue.pop(node_id)
#
if stream_id == "split":
@ -660,14 +661,15 @@ class Pipeline:
predecessors = set(nx.ancestors(self.graph, node_id))
if predecessors.isdisjoint(set(queue.keys())): # only execute if predecessor nodes are executed
try:
logger.debug(f"Running node `{node_id}` with input `{node_input}`")
logger.debug("Running node '%s` with input: %s", node_id, node_input)
node_output, stream_id = self.graph.nodes[node_id]["component"]._dispatch_run_batch(**node_input)
except Exception as e:
tb = traceback.format_exc()
# The input might be a really large object with thousands of embeddings.
# If you really want to see it, raise the log level.
logger.debug("Exception while running node '%s' with input %s", node_id, node_input)
raise Exception(
f"Exception while running node `{node_id}` with input `{node_input}`: {e}, "
f"full stack trace: {tb}"
)
f"Exception while running node '{node_id}': {e}\nEnable debug logging to see the data that was passed when the pipeline failed."
) from e
queue.pop(node_id)
if stream_id == "split":
@ -746,7 +748,7 @@ class Pipeline:
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
data_path = util.download_and_unzip(url, dataset_dir)
logger.info(f"Dataset downloaded here: {data_path}")
logger.info("Dataset downloaded here: %s", data_path)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test") # or split = "train" or "dev"
# check index before eval
@ -776,11 +778,11 @@ class Pipeline:
# Clean up document store
if not keep_index and document_store is not None and document_store.index is not None:
logger.info(f"Cleaning up: deleting index '{document_store.index}'...")
logger.info("Cleaning up: deleting index '%s' ...", document_store.index)
document_store.delete_index(document_store.index)
# Evaluate your retrieval using NDCG@k, MAP@K ...
logger.info(f"Retriever evaluation for k in: {retriever.k_values}")
logger.info("Retriever evaluation for k in: %s", retriever.k_values)
ndcg, map_, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
return ndcg, map_, recall, precision
@ -985,10 +987,10 @@ class Pipeline:
if not reuse_index:
raise HaystackError(f"Index '{document_store.index}' is not empty. Please provide an empty index.")
else:
logger.info(f"indexing {len(corpus_file_paths)} documents...")
logger.info("indexing %s documents...", len(corpus_file_paths))
index_pipeline.run(file_paths=corpus_file_paths, meta=corpus_file_metas, params=index_params)
document_count = document_store.get_document_count()
logger.info(f"indexing {len(evaluation_set_labels)} files to {document_count} documents finished.")
logger.info("indexing %s files to %s documents finished.", len(evaluation_set_labels), document_count)
tracker.track_params({"pipeline_index_document_count": document_count})
@ -1056,7 +1058,7 @@ class Pipeline:
# Clean up document store
if not reuse_index and document_store.index is not None:
logger.info(f"Cleaning up: deleting index '{document_store.index}'...")
logger.info("Cleaning up: deleting index '%s'...", document_store.index)
document_store.delete_index(document_store.index)
finally:
@ -1435,7 +1437,7 @@ class Pipeline:
for i, (query, query_labels) in enumerate(zip(queries, query_labels_per_query)):
if query_labels is None or query_labels.labels is None:
logger.warning(f"There is no label for query '{query}'. Query will be omitted.")
logger.warning("There is no label for query '%s'. Query will be omitted.", query)
continue
# remarks for no_answers:
@ -1925,7 +1927,7 @@ class Pipeline:
component_params = definitions[name].get("params", {})
component_type = definitions[name]["type"]
logger.debug(f"Loading component `{name}` of type `{definitions[name]['type']}`")
logger.debug(f"Loading component '%s' of type '%s'", name, definitions[name]["type"])
for key, value in component_params.items():
# Component params can reference to other components. For instance, a Retriever can reference a
@ -2188,9 +2190,9 @@ class _HaystackBeirRetrieverAdapter:
file_paths.append(file_path)
metas.append({"id": id, "name": doc.get("title", None)})
logger.info(f"indexing {len(corpus)} documents...")
logger.info("indexing %s documents...", len(corpus))
self.index_pipeline.run(file_paths=file_paths, meta=metas, params=self.index_params)
logger.info(f"indexing finished.")
logger.info("indexing finished.")
# adjust query_params to ensure top_k is retrieved
query_params = copy.deepcopy(self.query_params)


@ -208,7 +208,7 @@ def validate_yaml(
"""
pipeline_config = read_pipeline_config_from_yaml(path)
validate_config(pipeline_config=pipeline_config, strict_version_check=strict_version_check, extras=extras)
logging.debug(f"'{path}' contains valid Haystack pipelines.")
logging.debug("'%s' contains valid Haystack pipelines.", path)
def validate_config(
@ -343,7 +343,7 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
f"Validation failed. {validation.message}. {error_location} " "See the stacktrace for more information."
) from validation
logging.debug(f"The given configuration is valid according to the JSON schema.")
logging.debug("The given configuration is valid according to the JSON schema.")
def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_definitions: Dict[str, Any]):
@ -357,7 +357,7 @@ def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_defin
graph = _init_pipeline_graph(root_node_name=root_node_name)
for node in pipeline_definition["nodes"]:
graph = _add_node_to_pipeline_graph(graph=graph, node=node, components=component_definitions)
logging.debug(f"The graph for pipeline '{pipeline_definition['name']}' is valid.")
logging.debug("The graph for pipeline '%s' is valid.", pipeline_definition["name"])
def _find_root_in_pipeline_definition(pipeline_definition: Dict[str, Any]):


@ -225,6 +225,9 @@ class Document:
)
def __repr__(self):
values = self.to_dict()
if values.get("embedding", False):
values["embedding"] = f"<embedding of shape {values['embedding'].get('shape', '[no shape]')}>"
return f"<Document: {str(self.to_dict())}>"
def __str__(self):
@ -1389,7 +1392,7 @@ class EvaluationResult:
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
"""
out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
logger.info(f"Saving evaluation results to {out_dir}")
logger.info("Saving evaluation results to %s", out_dir)
if not out_dir.exists():
out_dir.mkdir(parents=True)
for node_name, df in self.node_results.items():


@ -73,7 +73,7 @@ def enable_writing_events_to_file():
Enables writing each event that is sent to the log file specified in LOG_PATH
"""
os.environ[HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED] = "True"
logger.info(f"Writing events to log file {LOG_PATH} has been enabled.")
logger.info("Writing events to log file %s has been enabled.", LOG_PATH)
def disable_writing_events_to_file():
@ -81,7 +81,7 @@ def disable_writing_events_to_file():
Disables writing each event that is sent to the log file specified in LOG_PATH
"""
os.environ[HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED] = "False"
logger.info(f"Writing events to log file {LOG_PATH} has been disabled.")
logger.info("Writing events to log file %s has been disabled.", LOG_PATH)
def is_telemetry_enabled() -> bool:
@ -235,7 +235,7 @@ def _read_telemetry_config():
if "user_id" in config and user_id is None:
user_id = config["user_id"]
except Exception as e:
logger.debug(f"Telemetry was not able to read the config file {CONFIG_PATH}.", exc_info=e)
logger.debug("Telemetry was not able to read the config file %s", CONFIG_PATH, exc_info=e)
def _write_telemetry_config():
@ -257,7 +257,7 @@ def _write_telemetry_config():
with open(CONFIG_PATH, "w") as outfile:
yaml.dump(config, outfile, default_flow_style=False)
except Exception:
logger.debug(f"Could not write config file to {CONFIG_PATH}.")
logger.debug("Could not write config file to %s", CONFIG_PATH)
send_custom_event(event="config saving failed")
@ -266,7 +266,7 @@ def _write_event_to_telemetry_log_file(distinct_id: str, event: str, properties:
with open(LOG_PATH, "a") as file_object:
file_object.write(f"{event}, {properties}, {distinct_id}\n")
except Exception as e:
logger.debug(f"Telemetry was not able to write event to log file {LOG_PATH}.", exc_info=e)
logger.debug("Telemetry was not able to write event to log file %s", LOG_PATH, exc_info=e)
def _delete_telemetry_file(file_type_to_delete: TelemetryFileType):
@ -279,7 +279,7 @@ def _delete_telemetry_file(file_type_to_delete: TelemetryFileType):
try:
path.unlink() # todo add missing_ok=True to the unlink() call when upgrading to python>3.7
except Exception as e:
logger.debug(f"Telemetry was not able to delete the {file_type_to_delete} at {path}.", exc_info=e)
logger.debug("Telemetry was not able to delete the %s at %s", file_type_to_delete, path, exc_info=e)
class NonPrivateParameters:


@ -546,7 +546,7 @@ class PipelineClient:
pipelines_url = f"{workspace_url}/pipelines"
response = self.client.post(url=pipelines_url, data=yaml.dump(config), headers=headers).json()
if "name" not in response or response["name"] != pipeline_config_name:
logger.warning(f"Unexpected response from saving pipeline config: {response}")
logger.warning("Unexpected response from saving pipeline config: %s", response)
def update_pipeline_config(
self,
@ -568,7 +568,7 @@ class PipelineClient:
yaml_url = f"{pipeline_url}/yaml"
response = self.client.put(url=yaml_url, data=yaml.dump(config), headers=headers).json()
if "name" not in response or response["name"] != pipeline_config_name:
logger.warning(f"Unexpected response from updating pipeline config: {response}")
logger.warning("Unexpected response from updating pipeline config: %s", response)
def deploy(
self,
@ -609,9 +609,9 @@ class PipelineClient:
if status == PipelineStatus.DEPLOYED:
if changed:
logger.info(f"Pipeline config '{pipeline_config_name}' successfully deployed.")
logger.info("Pipeline config '%s' successfully deployed.", pipeline_config_name)
else:
logger.info(f"Pipeline config '{pipeline_config_name}' is already deployed.")
logger.info("Pipeline config '%s' is already deployed.", pipeline_config_name)
logger.info(
f"Search endpoint for pipeline config '{pipeline_config_name}' is up and running for you under {pipeline_url}"
)
@ -628,7 +628,7 @@ class PipelineClient:
f" ]\n"
f"}}'"
)
logger.info(f"Try it out using the following curl command:\n{curl_cmd}")
logger.info("Try it out using the following curl command:\n%s", curl_cmd)
elif status == PipelineStatus.DEPLOYMENT_FAILED:
raise DeepsetCloudError(
@ -673,9 +673,9 @@ class PipelineClient:
if status == PipelineStatus.UNDEPLOYED:
if changed:
logger.info(f"Pipeline config '{pipeline_config_name}' successfully undeployed.")
logger.info("Pipeline config '%s' successfully undeployed.", pipeline_config_name)
else:
logger.info(f"Pipeline config '{pipeline_config_name}' is already undeployed.")
logger.info("Pipeline config '%s' is already undeployed.", pipeline_config_name)
elif status in [PipelineStatus.DEPLOYMENT_IN_PROGRESS, PipelineStatus.DEPLOYMENT_SCHEDULED]:
raise DeepsetCloudError(
f"Undeployment of pipeline config '{pipeline_config_name}' aborted. Deployment was requested."
@ -754,7 +754,7 @@ class PipelineClient:
raise DeepsetCloudError(f"Pipeline config '{pipeline_config_name}' does not exist anymore.")
status = PipelineStatus.from_str(pipeline_info["status"])
if status in valid_transitioning_states:
logger.info(f"Current status of '{pipeline_config_name}' is: '{status}'")
logger.info("Current status of '%s' is: '%s'", pipeline_config_name, status)
time.sleep(5)
return status, True
@ -907,7 +907,7 @@ class EvaluationSetClient:
f"Successfully uploaded evaluation set file {file_path}. You can access it now under evaluation set '{file_path.name}'."
)
except DeepsetCloudError as e:
logger.error(f"Error uploading evaluation set file {file_path}: {e.args}")
logger.error("Error uploading evaluation set file %s: %s", file_path, e.args)
def get_evaluation_set(
self, evaluation_set: Optional[str] = None, workspace: Optional[str] = None
@ -993,9 +993,9 @@ class FileClient:
file_id = response_file_upload.json().get("file_id")
file_ids.append(file_id)
except Exception as e:
logger.exception(f"Error uploading file {file_path}")
logger.exception("Error uploading file %s", file_path)
logger.info(f"Successfully uploaded {len(file_ids)} files.")
logger.info("Successfully uploaded %s files.", len(file_ids))
def delete_file(self, file_id: str, workspace: Optional[str] = None, headers: dict = None):
"""
@ -1147,7 +1147,7 @@ class EvaluationRunClient:
eval_run_url = f"{workspace_url}/eval_runs/{eval_run_name}"
response = self.client.delete(eval_run_url, headers=headers)
if response.status_code == 204:
logger.info(f"Evaluation run '{eval_run_name}' deleted.")
logger.info("Evaluation run '%s' deleted.", eval_run_name)
def start_eval_run(self, eval_run_name: str, workspace: Optional[str] = None, headers: dict = None):
"""
@ -1162,7 +1162,7 @@ class EvaluationRunClient:
eval_run_url = f"{workspace_url}/eval_runs/{eval_run_name}/start"
response = self.client.post(eval_run_url, headers=headers)
if response.status_code == 204:
logger.info(f"Evaluation run '{eval_run_name}' has been started.")
logger.info("Evaluation run '%s' has been started.", eval_run_name)
def update_eval_run(
self,

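For readers scanning the diff: every change in this commit has the same shape, swapping eager f-string interpolation for logging's lazy %-style arguments. A minimal sketch of the difference, using a made-up pipeline name rather than anything from the client above:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

pipeline_config_name = "my_pipeline"  # placeholder value, for illustration only

# Eager: the f-string is built even when INFO is disabled for this logger.
logger.info(f"Pipeline config '{pipeline_config_name}' successfully deployed.")

# Lazy: interpolation happens only if a handler actually emits the record,
# and the constant template lets log aggregation group identical messages.
logger.info("Pipeline config '%s' successfully deployed.", pipeline_config_name)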
View File

@ -79,7 +79,7 @@ def launch_weaviate(sleep=15):
def stop_container(container_name, delete_container=False):
logger.debug(f"Stopping {container_name}...")
logger.debug("Stopping %s...", container_name)
status = subprocess.run([f"docker stop {container_name}"], shell=True)
if status.returncode:
logger.warning(
@ -117,7 +117,7 @@ def stop_service(document_store, delete_container=False):
elif "WeaviateDocumentStore" in ds_class:
stop_weaviate(delete_container)
else:
logger.warning(f"No support yet for auto stopping the service behind a {type(document_store)}")
logger.warning("No support yet for auto stopping the service behind a %s", type(document_store))
def launch_milvus(sleep=15, delete_existing=False):

View File

@ -28,6 +28,6 @@ def cache_models(models: List[str] = None, use_auth_token: Optional[Union[str, b
import transformers
for model_to_cache in models:
logging.info(f"Caching {model_to_cache}")
logging.info("Caching %s", model_to_cache)
transformers.AutoTokenizer.from_pretrained(model_to_cache, use_auth_token=use_auth_token)
transformers.AutoModel.from_pretrained(model_to_cache, use_auth_token=use_auth_token)

View File

@ -117,19 +117,19 @@ class StdoutTrackingHead(BaseTrackingHead):
def init_experiment(
self, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False
):
logger.info(f"\n **** Starting experiment '{experiment_name}' (Run: {run_name}) ****")
logger.info("\n **** Starting experiment '%s' (Run: %s) ****", experiment_name, run_name)
def track_metrics(self, metrics: Dict[str, Any], step: int):
logger.info(f"Logged metrics at step {step}: \n {metrics}")
logger.info("Logged metrics at step %s: \n %s", step, metrics)
def track_params(self, params: Dict[str, Any]):
logger.info(f"Logged parameters: \n {params}")
logger.info("Logged parameters: \n %s", params)
def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
logger.warning(f"Cannot log artifacts with StdoutLogger: \n {dir_path}")
logger.warning("Cannot log artifacts with StdoutLogger: \n %s", dir_path)
def end_run(self):
logger.info(f"**** End of Experiment **** ")
logger.info("**** End of Experiment **** ")
class MLflowTrackingHead(BaseTrackingHead):
@ -148,7 +148,9 @@ class MLflowTrackingHead(BaseTrackingHead):
mlflow.set_tracking_uri(self.tracking_uri)
mlflow.set_experiment(experiment_name)
mlflow.start_run(run_name=run_name, nested=nested, tags=tags)
logger.info(f"Tracking run {run_name} of experiment {experiment_name} by mlflow under {self.tracking_uri}")
logger.info(
"Tracking run %s of experiment %s by mlflow under %s", run_name, experiment_name, self.tracking_uri
)
if self.auto_track_environment:
mlflow.log_params(flatten_dict({"environment": get_or_create_env_meta_data()}))
except ConnectionError:
@ -163,9 +165,9 @@ class MLflowTrackingHead(BaseTrackingHead):
metrics = flatten_dict(metrics)
mlflow.log_metrics(metrics, step=step)
except ConnectionError:
logger.warning(f"ConnectionError in logging metrics to MLflow.")
logger.warning("ConnectionError in logging metrics to MLflow.")
except Exception as e:
logger.warning(f"Failed to log metrics: {e}")
logger.warning("Failed to log metrics: %s", e)
def track_params(self, params: Dict[str, Any]):
try:
@ -174,15 +176,15 @@ class MLflowTrackingHead(BaseTrackingHead):
except ConnectionError:
logger.warning("ConnectionError in logging params to MLflow")
except Exception as e:
logger.warning(f"Failed to log params: {e}")
logger.warning("Failed to log params: %s", e)
def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
try:
mlflow.log_artifacts(dir_path, artifact_path)
except ConnectionError:
logger.warning(f"ConnectionError in logging artifacts to MLflow")
logger.warning("ConnectionError in logging artifacts to MLflow")
except Exception as e:
logger.warning(f"Failed to log artifacts: {e}")
logger.warning("Failed to log artifacts: %s", e)
def end_run(self):
mlflow.end_run()
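The MLflow head repeats the same try/except shape for metrics, params and artifacts; a hypothetical condensed helper (not part of this commit) shows the pattern in one place, assuming the same logger setup as above:

import logging
from typing import Any, Callable

logger = logging.getLogger(__name__)

def track_safely(log_fn: Callable[[Any], None], payload: Any, what: str) -> None:
    # Mirrors the handlers above: a ConnectionError gets a fixed message, while any
    # other failure is passed as a %s argument so str(e) is only computed if the
    # warning record is actually emitted.
    try:
        log_fn(payload)
    except ConnectionError:
        logger.warning("ConnectionError in logging %s to MLflow", what)
    except Exception as e:
        logger.warning("Failed to log %s: %s", what, e)

Usage would look like track_safely(mlflow.log_metrics, flatten_dict(metrics), "metrics"), though the explicit per-method handlers above are arguably clearer when the call signatures differ.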

View File

@ -81,10 +81,10 @@ def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] =
is_not_empty = len(list(Path(path).rglob("*"))) > 0
if is_not_empty:
logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
logger.info("Found data stored in '%s'. Delete this first if you really want to fetch new data.", output_dir)
return False
else:
logger.info(f"Fetching from {url} to `{output_dir}`")
logger.info("Fetching from %s to '%s'", url, output_dir)
_, _, archive_extension = url.rpartition(".")
request_data = requests.get(url, proxies=proxies)

View File

@ -192,8 +192,8 @@ def create_dpr_training_dataset(squad_data: dict, retriever: BaseRetriever, num_
n_questions += 1
yield dict_DPR
logger.info(f"Number of skipped questions: {n_non_added_questions}")
logger.info(f"Number of added questions: {n_questions}")
logger.info("Number of skipped questions: %s", n_non_added_questions)
logger.info("Number of added questions: %s", n_questions)
def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path, total_nb_questions: int, split_dataset: bool):

View File

@ -26,17 +26,17 @@ def setup_pipelines() -> Dict[str, Any]:
# Load query pipeline
query_pipeline = Pipeline.load_from_yaml(Path(config.PIPELINE_YAML_PATH), pipeline_name=config.QUERY_PIPELINE_NAME)
logging.info(f"Loaded pipeline nodes: {query_pipeline.graph.nodes.keys()}")
logging.info("Loaded pipeline nodes: %s", query_pipeline.graph.nodes.keys())
pipelines["query_pipeline"] = query_pipeline
# Find document store
document_store = query_pipeline.get_document_store()
logging.info(f"Loaded docstore: {document_store}")
logging.info("Loaded docstore: %s", document_store)
pipelines["document_store"] = document_store
# Setup concurrency limiter
concurrency_limiter = RequestLimiter(config.CONCURRENT_REQUEST_PER_WORKER)
logging.info("Concurrent requests per worker: {CONCURRENT_REQUEST_PER_WORKER}")
logging.info("Concurrent requests per worker: %s", config.CONCURRENT_REQUEST_PER_WORKER)
pipelines["concurrency_limiter"] = concurrency_limiter
# Load indexing pipeline (if available)
@ -53,7 +53,7 @@ def setup_pipelines() -> Dict[str, Any]:
except PipelineConfigError as e:
indexing_pipeline = None
logger.error(f"{e.message}\nFile Upload API will not be available.")
logger.error("%s\nFile Upload API will not be available.", e.message)
finally:
pipelines["indexing_pipeline"] = indexing_pipeline

View File

@ -122,7 +122,7 @@ def main():
for current_config in combine_config(distillation_settings):
descriptions.append(f"Results of student with distillation (config: {current_config}")
# distillation training
logger.info(f"Training student with distillation (config: {current_config}")
logger.info("Training student with distillation (config: %s)", current_config)
results.append(
train_student_with_distillation(
student, teacher, download_folder, train_file, test_file, **current_config, **training_settings
@ -145,9 +145,9 @@ def main():
logger.info("Evaluation results:")
for result, description in zip(results, descriptions):
logger.info(description)
logger.info(f"EM: {result['EM']}")
logger.info(f"F1: {result['f1']}")
logger.info(f"Top n accuracy: {result['top_n_accuracy']}")
logger.info("EM: %s", result["EM"])
logger.info("F1: %s", result["f1"])
logger.info("Top n accuracy: %s", result["top_n_accuracy"])
if __name__ == "__main__":

View File

@ -51,7 +51,7 @@ def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs)
index_to_doc_store(doc_store, docs, None, labels)
for reader_name in reader_models:
for reader_type in reader_types:
logger.info(f"##### Start reader run - model:{reader_name}, type: {reader_type} ##### ")
logger.info("##### Start reader run - model: %s, type: %s ##### ", reader_name, reader_type)
try:
reader = get_reader(reader_name, reader_type)
results = reader.eval(

View File

@ -56,7 +56,7 @@ def benchmark_indexing(
retriever_results = []
for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
logger.info("##### Start indexing run: %s, %s, %s docs ##### ", retriever_name, doc_store_name, n_docs)
try:
doc_store = get_document_store(doc_store_name)
retriever = get_retriever(retriever_name, doc_store, DEVICES)
@ -160,7 +160,7 @@ def benchmark_querying(
for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
try:
logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
logger.info("##### Start querying run: %s, %s, %s docs ##### ", retriever_name, doc_store_name, n_docs)
if retriever_name in ["elastic", "sentence_transformers"]:
similarity = "cosine"
else:
@ -273,7 +273,7 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
ret = []
id_to_doc = {x.meta["passage_id"]: x for x in docs}
for ef in embeddings_filenames:
logger.info(f"Adding precomputed embeddings from {embeddings_dir + ef}")
logger.info("Adding precomputed embeddings from %s", embeddings_dir + ef)
filename = embeddings_dir + ef
embeds = pickle.load(open(filename, "rb"))
for i, vec in embeds:
@ -284,7 +284,7 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
# In the official DPR repo, there are only 20594995 precomputed embeddings for 21015324 wikipedia passages
# If there isn't an embedding for a given doc, we remove it here
ret = [x for x in ret if x.embedding is not None]
logger.info(f"Embeddings loaded for {len(ret)}/{len(docs)} docs")
logger.info("Embeddings loaded for %s/%s docs", len(ret), len(docs))
return ret

View File

@ -170,16 +170,16 @@ def download_from_url(url: str, filepath: Union[str, Path]):
:return: local path of the downloaded file
"""
logger.info(f"Downloading {url}")
logger.info("Downloading %s", url)
# Create local folder
folder, filename = os.path.split(filepath)
if not os.path.exists(folder):
os.makedirs(folder)
# Download file if not present locally
if os.path.exists(filepath):
logger.info(f"Skipping {url} (exists locally)")
logger.info("Skipping %s (exists locally)", url)
else:
logger.info(f"Downloading {url} to {filepath} ")
logger.info("Downloading %s to %s", filepath)
with open(filepath, "wb") as file:
http_get(url=url, temp_file=file)
return filepath
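Because %-style interpolation is deferred, the number of arguments must match the placeholders exactly: a mismatch does not raise at the call site, but surfaces later as a "--- Logging error ---" traceback printed by the handler when the record is formatted. A minimal sketch, with made-up URL and path:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

url = "https://example.com/data.zip"  # made-up value
filepath = "/tmp/data.zip"            # made-up value

# Two placeholders, two arguments; dropping one would not fail here but would
# produce a TypeError inside the handler once the message is interpolated.
logger.info("Downloading %s to %s", url, filepath)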

View File

@ -1024,7 +1024,7 @@ def adaptive_model_qa(num_processes):
current_process = psutil.Process()
children = current_process.children()
if len(children) != 0:
logging.error(f"Not all the subprocesses are closed! {len(children)} are still running.")
logging.error("Not all the subprocesses are closed! %s are still running.", len(children))
@pytest.fixture

View File

@ -132,7 +132,7 @@ class Index:
response: dict = {"namespace": namespace, "vectors": {}}
if namespace not in self.index_config.namespaces:
# If we query an empty/non-existent namespace, Pinecone will just return an empty response
logger.warning(f"No namespace called '{namespace}'")
logger.warning("No namespace called '%s'", namespace)
return response
records = self.index_config.namespaces[namespace]
namespace_ids = records.keys()

View File

@ -56,11 +56,11 @@ def reduce_windows_recursion_limit():
default_recursion_limit = sys.getrecursionlimit()
if is_windows:
reduced_recursion_limit = default_recursion_limit // 2
logger.warning(f"Reducing recursion limit to {reduced_recursion_limit}")
logger.warning("Reducing recursion limit to %s", reduced_recursion_limit)
sys.setrecursionlimit(reduced_recursion_limit)
yield
if is_windows:
logger.warning(f"Resetting recursion limit to {default_recursion_limit}")
logger.warning("Resetting recursion limit to %s", default_recursion_limit)
sys.setrecursionlimit(default_recursion_limit)

View File

@ -30,9 +30,9 @@ def tutorial5_evaluation():
doc_index = "tutorial5_docs"
label_index = "tutorial5_labels"
##############################################
###############################################
# Code
##############################################
###############################################
launch_es()
# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers