diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py index 4dd495a63..df188d1d6 100644 --- a/haystack/document_stores/deepsetcloud.py +++ b/haystack/document_stores/deepsetcloud.py @@ -29,7 +29,7 @@ def disable_and_log(func): args_as_kwargs = args_to_kwargs(args, func) parameters = {**args_as_kwargs, **kwargs} - logger.info(f"Input to {func.__name__}: {parameters}") + logger.info("Input to %s: %s", func.__name__, parameters) return wrapper diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 3c3bbfd53..ba224aaa0 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -200,7 +200,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore): """ # Check if index_name refers to an alias if self.client.indices.exists_alias(name=index_name): - logger.debug(f"Index name {index_name} is an alias.") + logger.debug("Index name %s is an alias.", index_name) # check if the existing index has the embedding field; if not create it if self.client.indices.exists(index=index_name, headers=headers): @@ -949,7 +949,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore): all_terms_must_match=all_terms_must_match, ) - logger.debug(f"Retriever query: {body}") + logger.debug("Retriever query: %s", body) result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"] documents = [ @@ -1086,7 +1086,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore): body.append(headers) body.append(cur_query_body) - logger.debug(f"Retriever query: {body}") + logger.debug("Retriever query: %s", body) responses = self.client.msearch(index=index, body=body) all_documents = [] @@ -1287,7 +1287,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore): if excluded_meta_data: body["_source"] = {"excludes": excluded_meta_data} - logger.debug(f"Retriever query: {body}") + logger.debug("Retriever query: %s", body) try: result = self.client.search(index=index, body=body, request_timeout=300, headers=headers)["hits"]["hits"] if len(result) == 0: @@ -1458,12 +1458,16 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore): if update_existing_embeddings: document_count = self.get_document_count(index=index, headers=headers) - logger.info(f"Updating embeddings for all {document_count} docs ...") else: document_count = self.get_document_count( index=index, filters=filters, only_documents_without_embedding=True, headers=headers ) - logger.info(f"Updating embeddings for {document_count} docs without embeddings ...") + + logger.info( + "Updating embeddings for all %s docs %s...", + document_count, + "without embeddings" if not update_existing_embeddings else "", + ) result = self._get_all_documents_in_index( index=index, @@ -1674,7 +1678,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore): def _delete_index(self, index: str): if self.client.indices.exists(index): self.client.indices.delete(index=index, ignore=[400, 404]) - logger.info(f"Index '{index}' deleted.") + logger.info("Index '%s' deleted.", index) class ElasticsearchDocumentStore(BaseElasticsearchDocumentStore): diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py index 59444b17e..729c8e00a 100644 --- a/haystack/document_stores/faiss.py +++ b/haystack/document_stores/faiss.py @@ -346,7 +346,7 @@ class FAISSDocumentStore(SQLDocumentStore): logger.warning("Calling DocumentStore.update_embeddings() on an empty index") return - logger.info(f"Updating 
embeddings for {document_count} docs...") +        logger.info("Updating embeddings for %s docs...", document_count)         vector_id = sum(index.ntotal for index in self.faiss_indexes.values())         result = self._query( @@ -568,7 +568,7 @@ class FAISSDocumentStore(SQLDocumentStore):         )         if index in self.faiss_indexes:             del self.faiss_indexes[index] -            logger.info(f"Index '{index}' deleted.") +            logger.info("Index '%s' deleted.", index)         super().delete_index(index)      def query_by_embedding( diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py index 3871a1569..f99efb294 100644 --- a/haystack/document_stores/memory.py +++ b/haystack/document_stores/memory.py @@ -451,11 +451,10 @@ class InMemoryDocumentStore(BaseDocumentStore):         result = self._query(             index=index, filters=filters, only_documents_without_embedding=not update_existing_embeddings         ) -        document_count = len(result) -        logger.info(f"Updating embeddings for {document_count} docs ...") +        logger.info("Updating embeddings for %s docs ...", len(result))         batched_documents = get_batches_from_generator(result, batch_size)         with tqdm( -            total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding" +            total=len(result), disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding"         ) as progress_bar:             for document_batch in batched_documents:                 embeddings = retriever.embed_documents(document_batch)  # type: ignore @@ -782,7 +781,7 @@ class InMemoryDocumentStore(BaseDocumentStore):         """         if index in self.indexes:             del self.indexes[index] -            logger.info(f"Index '{index}' deleted.") +            logger.info("Index '%s' deleted.", index)      def delete_labels(         self, diff --git a/haystack/document_stores/memory_knowledgegraph.py b/haystack/document_stores/memory_knowledgegraph.py index 6108d1025..eda103322 100644 --- a/haystack/document_stores/memory_knowledgegraph.py +++ b/haystack/document_stores/memory_knowledgegraph.py @@ -37,7 +37,7 @@ class InMemoryKnowledgeGraph(BaseKnowledgeGraph):         if index not in self.indexes:             self.indexes[index] = Graph()         else: -            logger.warning(f"Index '{index}' is already present.") +            logger.warning("Index '%s' is already present.", index)      def delete_index(self, index: Optional[str] = None):         """ @@ -49,7 +49,7 @@         if index in self.indexes:             del self.indexes[index] -            logger.info(f"Index '{index}' deleted.") +            logger.info("Index '%s' deleted.", index)      def import_from_ttl_file(self, path: Path, index: Optional[str] = None):         """ diff --git a/haystack/document_stores/milvus1.py b/haystack/document_stores/milvus1.py index 2a86c0d20..a04513f16 100644 --- a/haystack/document_stores/milvus1.py +++ b/haystack/document_stores/milvus1.py @@ -313,13 +313,6 @@ class Milvus1DocumentStore(SQLDocumentStore):         index = index or self.index         self._create_collection_and_index_if_not_exist(index) -        document_count = self.get_document_count(index=index) -        if document_count == 0: -            logger.warning("Calling DocumentStore.update_embeddings() on an empty index") -            return - -        logger.info(f"Updating embeddings for {document_count} docs...") -         result = self._query(             index=index,             vector_ids=None, @@ -327,6 +320,14 @@ class Milvus1DocumentStore(SQLDocumentStore):             filters=filters,             only_documents_without_embedding=not update_existing_embeddings,         ) + +        document_count = len(result) +        if document_count == 0: +            logger.warning("Calling DocumentStore.update_embeddings() on an empty index") +            return + +        logger.info("Updating embeddings for
%s docs...", document_count) + batched_documents = get_batches_from_generator(result, batch_size) with tqdm( total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding" @@ -665,10 +666,10 @@ class Milvus1DocumentStore(SQLDocumentStore): index = index or self.index status, collection_info = self.milvus_server.get_collection_stats(collection_name=index) if not status.OK(): - logger.info(f"Failed fetch stats from store ...") + logger.info("Failed fetch stats from store ...") return list() - logger.debug(f"collection_info = {collection_info}") + logger.debug("collection_info = %s", collection_info) ids = list() partition_list = collection_info["partitions"] @@ -679,16 +680,16 @@ class Milvus1DocumentStore(SQLDocumentStore): status, id_list = self.milvus_server.list_id_in_segment( collection_name=index, segment_name=segment_name ) - logger.debug(f"{status}: segment {segment_name} has {len(id_list)} vectors ...") + logger.debug("%s: segment %s has %s vectors ...", status, segment_name, len(id_list)) ids.extend(id_list) if len(ids) == 0: - logger.info(f"No documents in the store ...") + logger.info("No documents in the store ...") return list() status, vectors = self.milvus_server.get_entity_by_id(collection_name=index, ids=ids) if not status.OK(): - logger.info(f"Failed fetch document for ids {ids} from store ...") + logger.info("Failed fetch document for ids %s from store ...", ids) return list() return vectors diff --git a/haystack/document_stores/milvus2.py b/haystack/document_stores/milvus2.py index d4d70e510..980c0b314 100644 --- a/haystack/document_stores/milvus2.py +++ b/haystack/document_stores/milvus2.py @@ -204,7 +204,7 @@ class Milvus2DocumentStore(SQLDocumentStore): for field in custom_fields: if field.name == self.id_field or field.name == self.embedding_field: - logger.warning(f"Skipping `{field.name}` as it is similar to `id_field` or `embedding_field`") + logger.warning("Skipping '%s' as it is similar to 'id_field' or 'embedding_field'", field.name) else: fields.append(field) @@ -353,7 +353,7 @@ class Milvus2DocumentStore(SQLDocumentStore): logger.warning("Calling DocumentStore.update_embeddings() on an empty index") return - logger.info(f"Updating embeddings for {document_count} docs...") + logger.info("Updating embeddings for %s docs...", document_count) result = self._query( index=index, @@ -516,7 +516,7 @@ class Milvus2DocumentStore(SQLDocumentStore): def _delete_index(self, index: str): if utility.has_collection(collection_name=index): utility.drop_collection(collection_name=index) - logger.info(f"Index '{index}' deleted.") + logger.info("Index '%s' deleted.", index) super().delete_index(index) def get_all_documents_generator( diff --git a/haystack/document_stores/opensearch.py b/haystack/document_stores/opensearch.py index 823a94cd9..4d7bc39d6 100644 --- a/haystack/document_stores/opensearch.py +++ b/haystack/document_stores/opensearch.py @@ -379,7 +379,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore): if excluded_meta_data: body["_source"] = {"excludes": excluded_meta_data} - logger.debug(f"Retriever query: {body}") + logger.debug("Retriever query: %s", body) result = self.client.search(index=index, body=body, request_timeout=300, headers=headers)["hits"]["hits"] documents = [ @@ -396,7 +396,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore): """ # Check if index_name refers to an alias if self.client.indices.exists_alias(name=index_name): - logger.debug(f"Index name {index_name} is an alias.") 
+ logger.debug("Index name %s is an alias.", index_name) # check if the existing index has the embedding field; if not create it if self.client.indices.exists(index=index_name, headers=headers): diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index de5e1d3a3..a4616b638 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -216,7 +216,7 @@ class PineconeDocumentStore(BaseDocumentStore): stats = index_connection.describe_index_stats() dims = stats["dimension"] count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 - logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") + logger.info("Index statistics: name: %s embedding dimensions: %s, record count: %s", index, dims, count) # return index connection return index_connection @@ -471,7 +471,7 @@ class PineconeDocumentStore(BaseDocumentStore): logger.warning("Calling DocumentStore.update_embeddings() on an empty index") return - logger.info(f"Updating embeddings for {document_count} docs...") + logger.info("Updating embeddings for %s docs...", document_count) # If the embedding namespace is empty or the user does not want to update existing embeddings, we use document namespace if self.get_embedding_count(index=index) == 0 or not update_existing_embeddings: @@ -1003,7 +1003,7 @@ class PineconeDocumentStore(BaseDocumentStore): index = self._index_name(index) if index in pinecone.list_indexes(): pinecone.delete_index(index) - logger.info(f"Index '{index}' deleted.") + logger.info("Index '%s' deleted.", index) if index in self.pinecone_indexes: del self.pinecone_indexes[index] if index in self.all_ids: diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index 5248801cd..61e547e0e 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -402,7 +402,7 @@ class SQLDocumentStore(BaseDocumentStore): try: meta_orms.append(MetaDocumentORM(name=key, value=value)) except TypeError as ex: - logger.error(f"Document {doc.id} - {ex}") + logger.error("Document %s - %s", doc.id, ex) doc_mapping = { "id": doc.id, "content": doc.to_dict()["content"], @@ -425,7 +425,7 @@ class SQLDocumentStore(BaseDocumentStore): try: self.session.commit() except Exception as ex: - logger.error(f"Transaction rollback: {ex.__cause__}") + logger.error("Transaction rollback: %s", ex.__cause__) # Rollback is important here otherwise self.session will be in inconsistent state and next call will fail self.session.rollback() raise ex @@ -495,7 +495,7 @@ class SQLDocumentStore(BaseDocumentStore): try: self.session.commit() except Exception as ex: - logger.error(f"Transaction rollback: {ex.__cause__}") + logger.error("Transaction rollback: %s", ex.__cause__) self.session.rollback() raise ex diff --git a/haystack/document_stores/utils.py b/haystack/document_stores/utils.py index ff14a5c74..b010973a0 100644 --- a/haystack/document_stores/utils.py +++ b/haystack/document_stores/utils.py @@ -34,7 +34,7 @@ def eval_data_from_json( with open(filename, "r", encoding="utf-8") as file: data = json.load(file) if "title" not in data["data"][0]: - logger.warning(f"No title information found for documents in QA file: {filename}") + logger.warning("No title information found for documents in QA file: %s", filename) for squad_document in data["data"]: if max_docs: diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index 057ae4720..11dcce0a7 100644 --- 
a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -313,7 +313,7 @@ class WeaviateDocumentStore(BaseDocumentStore):         try:             result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)         except weaviate.exceptions.UnexpectedStatusCodeException as usce: -            logging.debug(f"Weaviate could not get the document requested: {usce}") +            logging.debug("Weaviate could not get the document requested: %s", usce)         if result:             document = self._convert_weaviate_result_to_document(result, return_embedding=True)             return document @@ -340,7 +340,7 @@ class WeaviateDocumentStore(BaseDocumentStore):         try:             result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)         except weaviate.exceptions.UnexpectedStatusCodeException as usce: -            logging.debug(f"Weaviate could not get the document requested: {usce}") +            logging.debug("Weaviate could not get the document requested: %s", usce)         if result:             document = self._convert_weaviate_result_to_document(result, return_embedding=True)             documents.append(document) @@ -560,7 +560,7 @@ class WeaviateDocumentStore(BaseDocumentStore):                 and "error" in result["result"]["errors"]             ):                 for message in result["result"]["errors"]["error"]: -                    logger.error(f"{message['message']}") +                    logger.error(message["message"])             progress_bar.update(batch_size)         progress_bar.close() @@ -1215,7 +1218,10 @@ class WeaviateDocumentStore(BaseDocumentStore):             raise RuntimeError("Specify the arg `embedding_field` when initializing WeaviateDocumentStore()")         if update_existing_embeddings: -            logger.info(f"Updating embeddings for all {self.get_document_count(index=index)} docs ...") +            logger.info( +                "Updating embeddings for all %s docs ...", +                self.get_document_count(index=index), +            )         else:             raise RuntimeError(                 "All the documents in Weaviate store have an embedding by default. Only update is allowed!"             ) @@ -1379,7 +1382,7 @@ class WeaviateDocumentStore(BaseDocumentStore):         index = self._sanitize_index_name(index) or index         if any(c for c in self.weaviate_client.schema.get()["classes"] if c["class"] == index):             self.weaviate_client.schema.delete_class(index) -            logger.info(f"Index '{index}' deleted.") +            logger.info("Index '%s' deleted.", index)     def delete_labels(self):         """ diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py index 91271a7c0..cdb50a8ce 100644 --- a/haystack/modeling/data_handler/data_silo.py +++ b/haystack/modeling/data_handler/data_silo.py @@ -221,7 +221,7 @@ class DataSilo:         """         Load serialized dataset from a cache.         """ -        logger.info(f"Loading datasets from cache at {cache_dir}") +        logger.info("Loading datasets from cache at %s", cache_dir)         self.data["train"] = torch.load(cache_dir / "train_dataset")         dev_dataset_path = cache_dir / "dev_dataset" @@ -277,7 +277,7 @@ class DataSilo:             torch.save(self.data["test"], cache_dir / "test_dataset")             torch.save(self.tensor_names, cache_dir / "tensor_names") -        logger.info(f"Cached the datasets at {cache_dir}") +        logger.info("Cached the datasets at %s", cache_dir)     def _initialize_data_loaders(self):         """ diff --git a/haystack/modeling/data_handler/dataset.py b/haystack/modeling/data_handler/dataset.py index 1d318fcb3..3df1d4793 100644 --- a/haystack/modeling/data_handler/dataset.py +++ b/haystack/modeling/data_handler/dataset.py @@ -63,7 +63,7 @@ def convert_features_to_dataset(features):             )         except:             logger.debug( -                f"Could not determine type for feature '{t_name}'. " "Converting now to a tensor of default type long."
+ "Could not determine type for feature '%s'. Converting now to a tensor of default type long.", t_name ) # Convert all remaining python objects to torch long tensors diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index 7bdf153a5..60117629a 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -147,7 +147,7 @@ class Processor(ABC): sig = signature(cls.subclasses[processor_name]) unused_args = {k: v for k, v in kwargs.items() if k not in sig.parameters} logger.debug( - f"Got more parameters than needed for loading {processor_name}: {unused_args}. " f"Those won't be used!" + "Got more parameters than needed for loading %s: %s. Those won't be used!", processor_name, unused_args ) processor = cls.subclasses[processor_name]( data_dir=data_dir, @@ -324,7 +324,9 @@ class Processor(ABC): if problematic_sample_ids: n_problematic = len(problematic_sample_ids) problematic_id_str = ", ".join([str(i) for i in problematic_sample_ids]) - logger.error(f"Unable to convert {n_problematic} samples to features. Their ids are : {problematic_id_str}") + logger.error( + "Unable to convert %s samples to features. Their ids are : %s", n_problematic, problematic_id_str + ) @staticmethod def _check_sample_features(basket: SampleBasket): @@ -348,7 +350,7 @@ class Processor(ABC): return True def _log_samples(self, n_samples: int, baskets: List[SampleBasket]): - logger.debug("*** Show {} random examples ***".format(n_samples)) + logger.debug("*** Show %s random examples ***", n_samples) if len(baskets) == 0: logger.debug("*** No samples to show because there are no baskets ***") return @@ -1817,7 +1819,7 @@ class TextClassificationProcessor(Processor): self.header = header self.max_samples = max_samples self.dev_stratification = dev_stratification - logger.debug(f"Currently no support in Processor for returning problematic ids") + logger.debug("Currently no support in Processor for returning problematic ids") super(TextClassificationProcessor, self).__init__( tokenizer=tokenizer, @@ -2141,14 +2143,14 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None dev_labels[q["id"]] = q["answers"][0]["text"] not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys())) if len(not_included) > 0: - logger.info(f"There were missing predicitons for question ids: {list(not_included)}") + logger.info("There were missing predicitons for question ids: %s", list(not_included)) for x in not_included: predictions_json[x] = "" # os.makedirs("model_output", exist_ok=True) # filepath = Path("model_output") / out_filename json.dump(predictions_json, open(out_filename, "w")) - logger.info(f"Written Squad predictions to: {out_filename}") + logger.info("Written Squad predictions to: %s", out_filename) def _read_dpr_json( @@ -2188,7 +2190,7 @@ def _read_dpr_json( """ # get remote dataset if needed if not os.path.exists(file): - logger.info(f" Couldn't find {file} locally. Trying to download ...") + logger.info("Couldn't find %s locally. Trying to download ...", file) _download_extract_downstream_data(file, proxies=proxies) if Path(file).suffix.lower() == ".jsonl": @@ -2250,7 +2252,7 @@ def _read_dpr_json( def _read_squad_file(filename: str, proxies=None): """Read a SQuAD json file""" if not os.path.exists(filename): - logger.info(f" Couldn't find {filename} locally. Trying to download ...") + logger.info("Couldn't find %s locally. 
Trying to download ...", filename) _download_extract_downstream_data(filename, proxies) with open(filename, "r", encoding="utf-8") as reader: input_data = json.load(reader)["data"] diff --git a/haystack/modeling/evaluation/eval.py b/haystack/modeling/evaluation/eval.py index 7ff9ee966..4df198b2d 100644 --- a/haystack/modeling/evaluation/eval.py +++ b/haystack/modeling/evaluation/eval.py @@ -112,10 +112,10 @@ class Evaluator: for head_num, head in enumerate(model.prediction_heads): if head.model_type == "span_classification" and calibrate_conf_scores: temperature_previous = head.temperature_for_confidence.item() - logger.info(f"temperature used for confidence scores before calibration: {temperature_previous}") + logger.info("temperature used for confidence scores before calibration: %s", temperature_previous) head.calibrate_conf(logits_all[head_num], label_all[head_num]) temperature_current = head.temperature_for_confidence.item() - logger.info(f"temperature used for confidence scores after calibration: {temperature_current}") + logger.info("temperature used for confidence scores after calibration: %s", temperature_current) temperature_change = (abs(temperature_current - temperature_previous) / temperature_previous) * 100.0 if temperature_change > 50: logger.warning( diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py index 4542b4946..c7c289b4c 100644 --- a/haystack/modeling/infer.py +++ b/haystack/modeling/infer.py @@ -283,7 +283,7 @@ class Inferencer: else: num_processes = mp.cpu_count() self.process_pool = mp.Pool(processes=num_processes) - logger.info(f"Got ya {num_processes} parallel workers to do inference ...") + logger.info("Got ya %s parallel workers to do inference ...", num_processes) log_ascii_workers(n=num_processes, logger=logger) def close_multiprocessing_pool(self, join: bool = False): diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index 479bbd33b..a8db8f24e 100644 --- a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -142,7 +142,7 @@ class BaseAdaptiveModel: "does not currently support saving and loading" ) assert len(model_files) == len(config_files), error_str - logger.info(f"Found files for loading {len(model_files)} prediction heads") + logger.info("Found files for loading %s prediction heads", len(model_files)) return model_files, config_files @@ -573,7 +573,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): try: tracker.track_params(params) except Exception as e: - logger.warning(f"ML logging didn't work: {e}") + logger.warning("ML logging didn't work: %s", e) def verify_vocab_size(self, vocab_size: int): """ diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 9bca50b85..ba47af949 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -376,7 +376,7 @@ class BiAdaptiveModel(nn.Module): try: tracker.track_params(params) except Exception as e: - logger.warning(f"ML logging didn't work: {e}") + logger.warning("ML logging didn't work: %s", e) def verify_vocab_size(self, vocab_size1: int, vocab_size2: int): """ diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 0d46f9c55..020cec673 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -488,7 +488,7 @@ class HFLanguageModelNoSegmentIds(HFLanguageModelWithPooler): specified using the arguments 
`output_hidden_states` and `output_attentions`.         """         if segment_ids is not None: -            logging.warning(f"`segment_ids` is not None, but {self.name} does not use them. They will be ignored.") +            logger.warning("'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)         return super().forward(             input_ids=input_ids, @@ -906,7 +906,7 @@ def get_language_model(                 "Ensure that the model class name can be inferred from the directory name "                 "when loading a Transformers model."             ) -            logger.error(f"Using the AutoModel class for '{pretrained_model_name_or_path}'. This can cause crashes!") +            logger.error("Using the AutoModel class for '%s'. This can cause crashes!", pretrained_model_name_or_path)             model_type = "Auto"     # Find the class corresponding to this model type @@ -918,7 +918,9 @@ def get_language_model(             f"Supported model types are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}"         ) -    logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}' {'(' + model_type + ')' if model_type else ''}") +    logger.info( +        " * LOADING MODEL: '%s' %s", pretrained_model_name_or_path, "(" + model_type + ")" if model_type else "" +    )     # Instantiate the class for this model     language_model = language_model_class(         pretrained_model_name_or_path,         model_type=model_type,         language=language,         n_added_tokens=n_added_tokens,         use_auth_token=use_auth_token,         model_kwargs=model_kwargs,     )     logger.info( -        f"Loaded '{pretrained_model_name_or_path}' ({model_type} model) " -        f"from {'local file system' if config_file_exists else 'model hub'}." +        "Loaded '%s' (%s model) from %s.", +        pretrained_model_name_or_path, +        model_type, +        "local file system" if config_file_exists else "model hub",     )     return language_model @@ -963,7 +967,7 @@ def _get_model_type(         model_type = config.architectures[0] if is_supported_model(config.architectures[0]) else None     except Exception as e: -        logger.error(f"AutoConfig failed to load on '{model_name_or_path}': {str(e)}") +        logger.error("AutoConfig failed to load on '%s': %s", model_name_or_path, e)     if not model_type:         logger.warning("Could not infer the model type from its config.
Looking for clues in the model name.") @@ -991,5 +995,5 @@ def _guess_language(name: str) -> str: language = languages[0] else: language = "english" - logger.info(f"Auto-detected model language: {language}") + logger.info("Auto-detected model language: %s", language) return language diff --git a/haystack/modeling/model/optimization.py b/haystack/modeling/model/optimization.py index fe3d22448..e67c50b8b 100644 --- a/haystack/modeling/model/optimization.py +++ b/haystack/modeling/model/optimization.py @@ -188,7 +188,7 @@ def _get_optim(model, opts: Dict): optimizer_name = opts.pop("name", None) # Logging - logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'") + logger.info("Loading optimizer '%s': %s", optimizer_name, opts) tracker.track_params(opts) tracker.track_params({"optimizer_name": optimizer_name}) @@ -271,7 +271,7 @@ def get_scheduler(optimizer, opts): except AttributeError: raise AttributeError(f"Scheduler '{schedule_name}' not found in 'torch' or 'transformers'") - logger.info(f"Using scheduler '{schedule_name}'") + logger.info("Using scheduler '%s'", schedule_name) # get supported args of constructor allowed_args = inspect.signature(sched_constructor).parameters.keys() @@ -285,7 +285,7 @@ def get_scheduler(optimizer, opts): constructor_opts = {k: v for k, v in opts.items() if k in allowed_args} # Logging - logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'") + logger.info("Loading schedule '%s': '%s'", schedule_name, constructor_opts) tracker.track_params(constructor_opts) tracker.track_params({"schedule_name": schedule_name}) diff --git a/haystack/modeling/model/prediction_head.py b/haystack/modeling/model/prediction_head.py index 5905d2290..75325c359 100644 --- a/haystack/modeling/model/prediction_head.py +++ b/haystack/modeling/model/prediction_head.py @@ -265,7 +265,7 @@ class QuestionAnsweringHead(PredictionHead): self.layer_dims = layer_dims assert self.layer_dims[-1] == 2 self.feed_forward = FeedForwardBlock(self.layer_dims) - logger.debug(f"Prediction head initialized with size {self.layer_dims}") + logger.debug("Prediction head initialized with size %s", self.layer_dims) self.num_labels = self.layer_dims[-1] self.ph_output_type = "per_token_squad" self.model_type = "span_classification" # predicts start and end token of answer diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 9a76dab0d..b847b4c6e 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -409,7 +409,7 @@ class TriAdaptiveModel(nn.Module): try: tracker.track_params(params) except Exception as e: - logger.warning(f"ML logging didn't work: {e}") + logger.warning("ML logging didn't work: %s", e) def verify_vocab_size(self, vocab_size1: int, vocab_size2: int, vocab_size3: int): """Verifies that the model fits to the tokenizer vocabulary. 
diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index ae610312f..e278ac4f7 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -192,9 +192,9 @@ class Trainer: # when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint if resume_from_step and step <= resume_from_step: if step % 10000 == 0: - logger.info(f"Skipping {step} out of {resume_from_step} steps ...") + logger.info("Skipping %s out of %s steps ...", step, resume_from_step) if resume_from_step == step: - logger.info(f"Finished skipping {resume_from_step} steps ...") + logger.info("Finished skipping %s steps ...", resume_from_step) resume_from_step = None else: continue @@ -385,9 +385,9 @@ class Trainer: trainer = cls._load_checkpoint( path=checkpoint_to_load, data_silo=data_silo, model=model, optimizer=optimizer, local_rank=local_rank ) - logging.info(f"Resuming training from the train checkpoint at {checkpoint_to_load} ...") + logging.info("Resuming training from the train checkpoint at %s ...", checkpoint_to_load) else: - logging.info(f"No train checkpoints found. Starting a new training ...") + logging.info("No train checkpoints found. Starting a new training ...") trainer = cls( data_silo=data_silo, model=model, @@ -447,7 +447,7 @@ class Trainer: data_silo=data_silo, model=model, optimizer=optimizer, lr_schedule=scheduler, **trainer_state_dict ) - logger.info(f"Loaded a train checkpoint from {path}") + logger.info("Loaded a train checkpoint from %s", path) return trainer @classmethod @@ -516,7 +516,7 @@ class Trainer: for cp in saved_checkpoints[self.checkpoints_to_keep :]: shutil.rmtree(cp) - logger.info(f"Saved a training checkpoint after {checkpoint_name}") + logger.info("Saved a training checkpoint after %s", checkpoint_name) def _get_state_dict(self): """ diff --git a/haystack/modeling/utils.py b/haystack/modeling/utils.py index 21aa54038..755082a05 100644 --- a/haystack/modeling/utils.py +++ b/haystack/modeling/utils.py @@ -102,8 +102,9 @@ def initialize_device_settings( device_to_replace = torch.device("cuda") devices_to_use = [torch.device("cuda:0") if device == device_to_replace else device for device in devices_to_use] - logger.info(f"Using devices: {', '.join([str(device) for device in devices_to_use]).upper()}") - logger.info(f"Number of GPUs: {n_gpu}") + logger.info( + "Using devices: %s - Number of GPUs: %s", ", ".join([str(device) for device in devices_to_use]).upper(), n_gpu + ) return devices_to_use, n_gpu @@ -135,7 +136,7 @@ def try_get(keys, dictionary): ret = ret[0] return ret except Exception as e: - logger.warning(f"Cannot extract from dict {dictionary} with error: {e}") + logger.warning("Cannot extract from dict %s with error: %s", dictionary, e) return None diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py index c97da555f..9c63c5d64 100644 --- a/haystack/nodes/_json_schema.py +++ b/haystack/nodes/_json_schema.py @@ -176,7 +176,7 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[ node_name = getattr(node_class, "__name__") - logger.info(f"Creating schema for '{node_name}'") + logger.info("Creating schema for '%s'", node_name) # Read the relevant init parameters from __init__'s signature init_method = getattr(node_class, "__init__", None) @@ -400,7 +400,7 @@ def inject_definition_in_schema(node_class: Type[BaseComponent], schema: Dict[st schema_definition, node_ref = create_schema_for_node_class(node_class) 
schema["definitions"].update(schema_definition) schema["properties"]["components"]["items"]["anyOf"].append(node_ref) - logger.info(f"Added definition for {getattr(node_class, '__name__')}") + logger.info("Added definition for %s", getattr(node_class, "__name__")) return schema diff --git a/haystack/nodes/answer_generator/transformers.py b/haystack/nodes/answer_generator/transformers.py index 0714d6753..ab4a70bdf 100644 --- a/haystack/nodes/answer_generator/transformers.py +++ b/haystack/nodes/answer_generator/transformers.py @@ -124,7 +124,7 @@ class RAGenerator(BaseGenerator): if top_k > self.num_beams: top_k = self.num_beams - logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {num_beams}") + logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", num_beams) self.top_k = top_k @@ -238,7 +238,7 @@ class RAGenerator(BaseGenerator): if top_k > self.num_beams: top_k = self.num_beams - logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {top_k}") + logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", top_k) # Flatten the documents so easy to reference flat_docs_dict = self._flatten_docs(documents) @@ -382,7 +382,7 @@ class Seq2SeqGenerator(BaseGenerator): if top_k > self.num_beams: top_k = self.num_beams - logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {num_beams}") + logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", num_beams) self.top_k = top_k @@ -434,7 +434,7 @@ class Seq2SeqGenerator(BaseGenerator): if top_k > self.num_beams: top_k = self.num_beams - logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {top_k}") + logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", top_k) converter: Callable = Seq2SeqGenerator._get_converter(self.model_name_or_path) if not converter: diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index 72e475881..bf14a180a 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -216,9 +216,11 @@ class Crawler(BaseComponent): file_paths: list = [] is_not_empty = len(list(output_dir.rglob("*"))) > 0 if is_not_empty and not overwrite_existing_files: - logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.") + logger.info( + "Found data stored in `%s`. 
Delete this first if you really want to fetch new data.", output_dir + ) else: - logger.info(f"Fetching from {urls} to `{output_dir}`") + logger.info("Fetching from %s to `%s`", urls, output_dir) # Start by writing out the initial list of urls if filter_urls: @@ -278,7 +280,7 @@ class Crawler(BaseComponent): ) -> List[Path]: paths = [] for link in urls: - logger.info(f"writing contents from `{link}`") + logger.info("writing contents from '%s'", link) self.driver.get(link) if loading_wait_time is not None: time.sleep(loading_wait_time) diff --git a/haystack/nodes/document_classifier/base.py b/haystack/nodes/document_classifier/base.py index 59702867d..c20355631 100644 --- a/haystack/nodes/document_classifier/base.py +++ b/haystack/nodes/document_classifier/base.py @@ -37,7 +37,7 @@ class BaseDocumentClassifier(BaseComponent): results = [] document_ids = [doc.id for doc in results] - logger.debug(f"Classified documents with IDs: {document_ids}") + logger.debug("Classified documents with IDs: %s", document_ids) # convert back to dicts if we are in an indexing pipeline if root_node == "File": @@ -54,11 +54,11 @@ class BaseDocumentClassifier(BaseComponent): if isinstance(documents[0], Document): document_ids = [doc.id for doc in results] - logger.debug(f"Classified documents with IDs: {document_ids}") + logger.debug("Classified documents with IDs: %s", document_ids) else: for doc_list in results: document_ids = [doc.id for doc in doc_list] - logger.debug(f"Classified documents with IDs: {document_ids}") + logger.debug("Classified documents with IDs: %s", document_ids) return output, "output_1" diff --git a/haystack/nodes/file_converter/image.py b/haystack/nodes/file_converter/image.py index a50cccf42..7d9eadf12 100644 --- a/haystack/nodes/file_converter/image.py +++ b/haystack/nodes/file_converter/image.py @@ -135,7 +135,7 @@ class ImageToTextConverter(BaseConverter): # remove lines having > 40% of words as digits AND not ending with a period(.) if remove_numeric_tables: if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): - logger.debug(f"Removing line '{line}' from file") + logger.debug("Removing line '%s' from file", line) continue cleaned_lines.append(line) diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index 9e2438aca..038927470 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -140,7 +140,7 @@ class PDFToTextConverter(BaseConverter): # remove lines having > 40% of words as digits AND not ending with a period(.) 
if remove_numeric_tables: if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): - logger.debug(f"Removing line '{line}' from {file_path}") + logger.debug("Removing line '%s' from %s", line, file_path) continue cleaned_lines.append(line) @@ -256,7 +256,7 @@ class PDFToTextOCRConverter(BaseConverter): image.save(temp_img.name) pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content) except Exception as exception: - logger.error(f"File {file_path} has an error \n {exception}") + logger.error("File %s has an error:\n%s", file_path, exception) raw_text = "\f".join(pages) document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys) diff --git a/haystack/nodes/file_converter/tika.py b/haystack/nodes/file_converter/tika.py index 1c8514efe..09ca76ef6 100644 --- a/haystack/nodes/file_converter/tika.py +++ b/haystack/nodes/file_converter/tika.py @@ -160,7 +160,7 @@ class TikaConverter(BaseConverter): # remove lines having > 40% of words as digits AND not ending with a period(.) if remove_numeric_tables: if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): - logger.debug(f"Removing line '{line}' from {file_path}") + logger.debug("Removing line '%s' from %s", line, file_path) continue cleaned_lines.append(line) diff --git a/haystack/nodes/file_converter/txt.py b/haystack/nodes/file_converter/txt.py index a1b53f8a6..890c21584 100644 --- a/haystack/nodes/file_converter/txt.py +++ b/haystack/nodes/file_converter/txt.py @@ -63,7 +63,7 @@ class TextConverter(BaseConverter): # remove lines having > 40% of words as digits AND not ending with a period(.) if remove_numeric_tables: if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): - logger.debug(f"Removing line '{line}' from {file_path}") + logger.debug("Removing line '%s' from %s", line, file_path) continue cleaned_lines.append(line) diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index b5b09f50c..cb71d1a55 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -465,7 +465,7 @@ class PreProcessor(BasePreProcessor): found_footer = self._find_longest_common_ngram(end_of_pages) if found_footer: pages = [page.replace(found_footer, "") for page in pages] - logger.debug(f"Removed header '{found_header}' and footer '{found_footer}' in document") + logger.debug("Removed header '%s' and footer '%s' in document", found_header, found_footer) text = "\f".join(pages) return text @@ -537,10 +537,10 @@ class PreProcessor(BasePreProcessor): sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle") sentences = sentence_tokenizer.tokenize(text) except LookupError: - logger.exception(f"PreProcessor couldn't load sentence tokenizer from {str(tokenizer_model_path)}") + logger.exception("PreProcessor couldn't load sentence tokenizer from %s", tokenizer_model_path) except (UnpicklingError, ValueError) as e: logger.exception( - f"PreProcessor couldn't determine model format of sentence tokenizer at {str(tokenizer_model_path)}." 
+ "PreProcessor couldn't determine model format of sentence tokenizer at %s", tokenizer_model_path ) if sentences: return sentences diff --git a/haystack/nodes/ranker/base.py b/haystack/nodes/ranker/base.py index fb0e70b03..d1e1b1bd1 100644 --- a/haystack/nodes/ranker/base.py +++ b/haystack/nodes/ranker/base.py @@ -41,7 +41,7 @@ class BaseRanker(BaseComponent): results = [] document_ids = [doc.id for doc in results] - logger.debug(f"Retrieved documents with IDs: {document_ids}") + logger.debug("Retrieved documents with IDs: %s", document_ids) output = {"documents": results} return output, "output_1" @@ -59,7 +59,7 @@ class BaseRanker(BaseComponent): for doc_list in results: document_ids = [doc.id for doc in doc_list] - logger.debug(f"Ranked documents with IDs: {document_ids}") + logger.debug("Ranked documents with IDs: %s", document_ids) output = {"documents": results} diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index 7674fe8f6..aeaeae229 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -728,7 +728,7 @@ class FARMReader(BaseReader): :param directory: Directory where the Reader model should be saved """ - logger.info(f"Saving reader model to {directory}") + logger.info("Saving reader model to %s", directory) self.inferencer.model.save(directory) self.inferencer.processor.save(directory) @@ -1026,7 +1026,7 @@ class FARMReader(BaseReader): aggregated_per_doc = defaultdict(list) for label in labels: if not label.document.id: - logger.error(f"Label does not contain a document id") + logger.error("Label does not contain a document id") continue aggregated_per_doc[label.document.id].append(label) @@ -1036,7 +1036,7 @@ class FARMReader(BaseReader): for doc_id in all_doc_ids: doc = document_store.get_document_by_id(doc_id, index=doc_index) if not doc: - logger.error(f"Document with the ID '{doc_id}' is not present in the document store.") + logger.error("Document with the ID '%s' is not present in the document store.", doc_id) continue d[str(doc_id)] = {"context": doc.content} # get all questions / answers @@ -1047,7 +1047,7 @@ class FARMReader(BaseReader): for label in aggregated_per_doc[doc_id]: aggregation_key = (doc_id, label.query) if label.answer is None: - logger.error(f"Label.answer was None, but Answer object was expected: {label} ") + logger.error("Label.answer was None, but Answer object was expected: %s", label) continue if label.answer.offsets_in_document is None: logger.error( diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index eca5960fe..78a5477ac 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -177,13 +177,13 @@ class TableReader(BaseReader): no_answer_score = 1.0 for document in documents: if document.content_type != "table": - logger.warning(f"Skipping document with id '{document.id}' in TableReader as it is not of type table.") + logger.warning("Skipping document with id '%s' in TableReader as it is not of type table.", document.id) continue table: pd.DataFrame = document.content if table.shape[0] == 0: logger.warning( - f"Skipping document with id '{document.id}' in TableReader as it does not contain any rows." 
+ "Skipping document with id '%s' in TableReader as it does not contain any rows.", document.id ) continue # Tokenize query and current table @@ -656,13 +656,13 @@ class RCIReader(BaseReader): answers = [] for document in documents: if document.content_type != "table": - logger.warning(f"Skipping document with id '{document.id}' in RCIReader as it is not of type table.") + logger.warning("Skipping document with id '%s' in RCIReader as it is not of type table.", document.id) continue table: pd.DataFrame = document.content if table.shape[0] == 0: logger.warning( - f"Skipping document with id '{document.id}' in RCIReader as it does not contain any rows." + "Skipping document with id '%s' in RCIReader as it does not contain any rows.", document.id ) continue table = table.astype(str) diff --git a/haystack/nodes/retriever/_embedding_encoder.py b/haystack/nodes/retriever/_embedding_encoder.py index a0eb8a6fc..46e4de1bc 100644 --- a/haystack/nodes/retriever/_embedding_encoder.py +++ b/haystack/nodes/retriever/_embedding_encoder.py @@ -221,7 +221,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder): else: train_examples.append(InputExample(texts=texts)) - logger.info(f"Training/adapting {self.embedding_model} with {len(train_examples)} examples") + logger.info("Training/adapting %s with %s examples", self.embedding_model, len(train_examples)) train_dataloader = DataLoader(train_examples, batch_size=batch_size, drop_last=True, shuffle=True) train_loss = st_loss.loss(self.embedding_model) diff --git a/haystack/nodes/retriever/base.py b/haystack/nodes/retriever/base.py index 890137fa8..f5644d5e7 100644 --- a/haystack/nodes/retriever/base.py +++ b/haystack/nodes/retriever/base.py @@ -328,7 +328,7 @@ class BaseRetriever(BaseComponent): query=query, filters=filters, top_k=top_k, index=index, headers=headers, scale_score=scale_score ) document_ids = [doc.id for doc in documents] - logger.debug(f"Retrieved documents with IDs: {document_ids}") + logger.debug("Retrieved documents with IDs: %s", document_ids) output = {"documents": documents} return output, "output_1" @@ -351,13 +351,13 @@ class BaseRetriever(BaseComponent): if not isinstance(doc, Document): raise HaystackError(f"doc was of type {type(doc)}, but expected a Document.") document_ids.append(doc.id) - logger.debug(f"Retrieved documents with IDs: {document_ids}") + logger.debug("Retrieved documents with IDs: %s", document_ids) else: for doc_list in documents: if not isinstance(doc_list, list): raise HaystackError(f"doc_list was of type {type(doc_list)}, but expected a list of Documents.") document_ids = [doc.id for doc in doc_list] - logger.debug(f"Retrieved documents with IDs: {document_ids}") + logger.debug("Retrieved documents with IDs: %s", document_ids) output = {"documents": documents} return output, "output_1" diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index ae76ec097..4491da525 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -752,7 +752,7 @@ class DensePassageRetriever(BaseRetriever): use_fast_tokenizers=use_fast_tokenizers, similarity_function=similarity_function, ) - logger.info(f"DPR model loaded from {load_dir}") + logger.info("DPR model loaded from %s", load_dir) return dpr @@ -1435,7 +1435,7 @@ class TableTextRetriever(BaseRetriever): use_fast_tokenizers=use_fast_tokenizers, similarity_function=similarity_function, ) - logger.info(f"TableTextRetriever model loaded from {load_dir}") + logger.info("TableTextRetriever model loaded from 
%s", load_dir) return mm_retriever @@ -1530,7 +1530,7 @@ class EmbeddingRetriever(BaseRetriever): else model_format ) - logger.info(f"Init retriever using embeddings of model {embedding_model}") + logger.info("Init retriever using embeddings of model %s", embedding_model) if self.model_format not in _EMBEDDING_ENCODERS.keys(): raise ValueError(f"Unknown retriever embedding model format {model_format}") diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index 3f87e6879..059abbc85 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -438,7 +438,7 @@ class TfidfRetriever(BaseRetriever): continue paragraphs.append(Paragraph(document_id=doc.id, paragraph_id=p_id, content=(p,), meta=doc.meta)) p_id += 1 - logger.info(f"Found {len(paragraphs)} candidate paragraphs from {len(documents)} docs in DB") + logger.info("Found %s candidate paragraphs from %s docs in DB", len(paragraphs), len(documents)) return paragraphs def _calc_scores(self, queries: Union[str, List[str]]) -> List[Dict[int, float]]: @@ -514,7 +514,9 @@ class TfidfRetriever(BaseRetriever): df_sliced = df_sliced[:top_k] logger.debug( - f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}" + "Identified %s candidates via retriever:\n%s", + df_sliced.shape[0], + df_sliced.to_string(col_space=10, index=False), ) # get actual content for the top candidates @@ -585,8 +587,9 @@ class TfidfRetriever(BaseRetriever): df_sliced = self.df.loc[query_result.keys()] df_sliced = df_sliced[:top_k] logger.debug( - f"Identified {df_sliced.shape[0]} candidates via retriever:" - f"\n {df_sliced.to_string(col_space=10, index=False)}" + "Identified %s candidates via retriever:\n%s", + df_sliced.shape[0], + df_sliced.to_string(col_space=10, index=False), ) # get actual content for the top candidates diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index a61945f1a..2e2103e12 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -14,7 +14,6 @@ import json import inspect import logging import tempfile -import traceback from pathlib import Path import yaml @@ -296,14 +295,14 @@ class Pipeline: f"Deployed pipeline configs are not allowed to be updated. Please undeploy pipeline config '{pipeline_config_name}' first." ) client.update_pipeline_config(config=config, pipeline_config_name=pipeline_config_name) - logger.info(f"Pipeline config '{pipeline_config_name}' successfully updated.") + logger.info("Pipeline config '%s' successfully updated.", pipeline_config_name) else: raise ValueError( f"Pipeline config '{pipeline_config_name}' already exists. Set `overwrite=True` to overwrite pipeline config." 
)                 else:             client.save_pipeline_config(config=config, pipeline_config_name=pipeline_config_name) -            logger.info(f"Pipeline config '{pipeline_config_name}' successfully created.") +            logger.info("Pipeline config '%s' successfully created.", pipeline_config_name)     @classmethod     def deploy_on_deepset_cloud( @@ -508,13 +507,15 @@ class Pipeline:             predecessors = set(nx.ancestors(self.graph, node_id))             if predecessors.isdisjoint(set(queue.keys())):  # only execute if predecessor nodes are executed                 try: -                    logger.debug(f"Running node `{node_id}` with input `{node_input}`") +                    logger.debug("Running node '%s' with input: %s", node_id, node_input)                     node_output, stream_id = self._run_node(node_id, node_input)                 except Exception as e: -                    tb = traceback.format_exc() +                    # The input might be a really large object with thousands of embeddings. +                    # If you really want to see it, raise the log level. +                    logger.debug("Exception while running node '%s' with input %s", node_id, node_input)                     raise Exception( -                        f"Exception while running node `{node_id}` with input `{node_input}`: {e}, full stack trace: {tb}" -                    ) +                        f"Exception while running node '{node_id}': {e}\nEnable debug logging to see the data that was passed when the pipeline failed." +                    ) from e                 queue.pop(node_id)                 #                 if stream_id == "split": @@ -660,14 +661,15 @@ class Pipeline:             predecessors = set(nx.ancestors(self.graph, node_id))             if predecessors.isdisjoint(set(queue.keys())):  # only execute if predecessor nodes are executed                 try: -                    logger.debug(f"Running node `{node_id}` with input `{node_input}`") +                    logger.debug("Running node '%s' with input: %s", node_id, node_input)                     node_output, stream_id = self.graph.nodes[node_id]["component"]._dispatch_run_batch(**node_input)                 except Exception as e: -                    tb = traceback.format_exc() +                    # The input might be a really large object with thousands of embeddings. +                    # If you really want to see it, raise the log level. +                    logger.debug("Exception while running node '%s' with input %s", node_id, node_input)                     raise Exception( -                        f"Exception while running node `{node_id}` with input `{node_input}`: {e}, " -                        f"full stack trace: {tb}" -                    ) +                        f"Exception while running node '{node_id}': {e}\nEnable debug logging to see the data that was passed when the pipeline failed." +                    ) from e                 queue.pop(node_id)             if stream_id == "split": @@ -746,7 +748,7 @@ class Pipeline:         url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"         data_path = util.download_and_unzip(url, dataset_dir) -        logger.info(f"Dataset downloaded here: {data_path}") +        logger.info("Dataset downloaded here: %s", data_path)         corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")  # or split = "train" or "dev"         # check index before eval @@ -776,11 +778,11 @@ class Pipeline:         # Clean up document store         if not keep_index and document_store is not None and document_store.index is not None: -            logger.info(f"Cleaning up: deleting index '{document_store.index}'...") +            logger.info("Cleaning up: deleting index '%s' ...", document_store.index)             document_store.delete_index(document_store.index)         # Evaluate your retrieval using NDCG@k, MAP@K ... -        logger.info(f"Retriever evaluation for k in: {retriever.k_values}") +        logger.info("Retriever evaluation for k in: %s", retriever.k_values)         ndcg, map_, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)         return ndcg, map_, recall, precision @@ -985,10 +987,10 @@ class Pipeline:             if not reuse_index:                 raise HaystackError(f"Index '{document_store.index}' is not empty.
Please provide an empty index.")             else: -                logger.info(f"indexing {len(corpus_file_paths)} documents...") +                logger.info("indexing %s documents...", len(corpus_file_paths))                 index_pipeline.run(file_paths=corpus_file_paths, meta=corpus_file_metas, params=index_params)                 document_count = document_store.get_document_count() -                logger.info(f"indexing {len(evaluation_set_labels)} files to {document_count} documents finished.") +                logger.info("indexing %s files to %s documents finished.", len(evaluation_set_labels), document_count)                 tracker.track_params({"pipeline_index_document_count": document_count}) @@ -1056,7 +1058,7 @@ class Pipeline:             # Clean up document store             if not reuse_index and document_store.index is not None: -                logger.info(f"Cleaning up: deleting index '{document_store.index}'...") +                logger.info("Cleaning up: deleting index '%s'...", document_store.index)                 document_store.delete_index(document_store.index)         finally: @@ -1435,7 +1437,7 @@ class Pipeline:         for i, (query, query_labels) in enumerate(zip(queries, query_labels_per_query)):             if query_labels is None or query_labels.labels is None: -                logger.warning(f"There is no label for query '{query}'. Query will be omitted.") +                logger.warning("There is no label for query '%s'. Query will be omitted.", query)                 continue             # remarks for no_answers: @@ -1925,7 +1927,7 @@ class Pipeline:             component_params = definitions[name].get("params", {})             component_type = definitions[name]["type"] -            logger.debug(f"Loading component `{name}` of type `{definitions[name]['type']}`") +            logger.debug("Loading component '%s' of type '%s'", name, definitions[name]["type"])             for key, value in component_params.items():                 # Component params can reference to other components. For instance, a Retriever can reference a @@ -2188,9 +2190,9 @@ class _HaystackBeirRetrieverAdapter:                 file_paths.append(file_path)                 metas.append({"id": id, "name": doc.get("title", None)}) -        logger.info(f"indexing {len(corpus)} documents...") +        logger.info("indexing %s documents...", len(corpus))         self.index_pipeline.run(file_paths=file_paths, meta=metas, params=self.index_params) -        logger.info(f"indexing finished.") +        logger.info("indexing finished.")         # adjust query_params to ensure top_k is retrieved         query_params = copy.deepcopy(self.query_params) diff --git a/haystack/pipelines/config.py b/haystack/pipelines/config.py index a423171d7..9262fa40d 100644 --- a/haystack/pipelines/config.py +++ b/haystack/pipelines/config.py @@ -208,7 +208,7 @@ def validate_yaml(     """     pipeline_config = read_pipeline_config_from_yaml(path)     validate_config(pipeline_config=pipeline_config, strict_version_check=strict_version_check, extras=extras) -    logging.debug(f"'{path}' contains valid Haystack pipelines.") +    logging.debug("'%s' contains valid Haystack pipelines.", path)  def validate_config( @@ -343,7 +343,7 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e             f"Validation failed. {validation.message}. {error_location} "             "See the stacktrace for more information."
) from validation - logging.debug(f"The given configuration is valid according to the JSON schema.") + logging.debug("The given configuration is valid according to the JSON schema.") def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_definitions: Dict[str, Any]): @@ -357,7 +357,7 @@ def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_defin graph = _init_pipeline_graph(root_node_name=root_node_name) for node in pipeline_definition["nodes"]: graph = _add_node_to_pipeline_graph(graph=graph, node=node, components=component_definitions) - logging.debug(f"The graph for pipeline '{pipeline_definition['name']}' is valid.") + logging.debug("The graph for pipeline '%s' is valid.", pipeline_definition["name"]) def _find_root_in_pipeline_definition(pipeline_definition: Dict[str, Any]): diff --git a/haystack/schema.py b/haystack/schema.py index 96210fbf4..1138c09b9 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -225,6 +225,9 @@ class Document: ) def __repr__(self): + values = self.to_dict() + if values.get("embedding", False): + values["embedding"] = f"<embedding of shape {values['embedding'].shape}>" return f"<Document: {str(values)}>" def __str__(self): @@ -1389,7 +1392,7 @@ class EvaluationResult: index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars) """ out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir) - logger.info(f"Saving evaluation results to {out_dir}") + logger.info("Saving evaluation results to %s", out_dir) if not out_dir.exists(): out_dir.mkdir(parents=True) for node_name, df in self.node_results.items(): diff --git a/haystack/telemetry.py b/haystack/telemetry.py index 5749dbf19..27173ae0e 100644 --- a/haystack/telemetry.py +++ b/haystack/telemetry.py @@ -73,7 +73,7 @@ def enable_writing_events_to_file(): Enables writing each event that is sent to the log file specified in LOG_PATH """ os.environ[HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED] = "True" - logger.info(f"Writing events to log file {LOG_PATH} has been enabled.") + logger.info("Writing events to log file %s has been enabled.", LOG_PATH) def disable_writing_events_to_file(): @@ -81,7 +81,7 @@ def disable_writing_events_to_file(): Disables writing each event that is sent to the log file specified in LOG_PATH """ os.environ[HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED] = "False" - logger.info(f"Writing events to log file {LOG_PATH} has been disabled.") + logger.info("Writing events to log file %s has been disabled.", LOG_PATH) def is_telemetry_enabled() -> bool: @@ -235,7 +235,7 @@ def _read_telemetry_config(): if "user_id" in config and user_id is None: user_id = config["user_id"] except Exception as e: - logger.debug(f"Telemetry was not able to read the config file {CONFIG_PATH}.", exc_info=e) + logger.debug("Telemetry was not able to read the config file %s", CONFIG_PATH, exc_info=e) def _write_telemetry_config(): @@ -257,7 +257,7 @@ def _write_telemetry_config(): with open(CONFIG_PATH, "w") as outfile: yaml.dump(config, outfile, default_flow_style=False) except Exception: - logger.debug(f"Could not write config file to {CONFIG_PATH}.") + logger.debug("Could not write config file to %s", CONFIG_PATH) send_custom_event(event="config saving failed") @@ -266,7 +266,7 @@ def _write_event_to_telemetry_log_file(distinct_id: str, event: str, properties: with open(LOG_PATH, "a") as file_object: file_object.write(f"{event}, {properties}, {distinct_id}\n") except Exception as e: - logger.debug(f"Telemetry was not able to write event to log file {LOG_PATH}.", exc_info=e) + logger.debug("Telemetry was not able to write 
event to log file %s", LOG_PATH, exc_info=e) def _delete_telemetry_file(file_type_to_delete: TelemetryFileType): @@ -279,7 +279,7 @@ def _delete_telemetry_file(file_type_to_delete: TelemetryFileType): try: path.unlink() # todo add missing_ok=True to the unlink() call when upgrading to python>3.7 except Exception as e: - logger.debug(f"Telemetry was not able to delete the {file_type_to_delete} at {path}.", exc_info=e) + logger.debug("Telemetry was not able to delete the %s at %s", file_type_to_delete, path, exc_info=e) class NonPrivateParameters: diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index 820079ec4..6df72e4be 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -546,7 +546,7 @@ class PipelineClient: pipelines_url = f"{workspace_url}/pipelines" response = self.client.post(url=pipelines_url, data=yaml.dump(config), headers=headers).json() if "name" not in response or response["name"] != pipeline_config_name: - logger.warning(f"Unexpected response from saving pipeline config: {response}") + logger.warning("Unexpected response from saving pipeline config: %s", response) def update_pipeline_config( self, @@ -568,7 +568,7 @@ class PipelineClient: yaml_url = f"{pipeline_url}/yaml" response = self.client.put(url=yaml_url, data=yaml.dump(config), headers=headers).json() if "name" not in response or response["name"] != pipeline_config_name: - logger.warning(f"Unexpected response from updating pipeline config: {response}") + logger.warning("Unexpected response from updating pipeline config: %s", response) def deploy( self, @@ -609,9 +609,9 @@ class PipelineClient: if status == PipelineStatus.DEPLOYED: if changed: - logger.info(f"Pipeline config '{pipeline_config_name}' successfully deployed.") + logger.info("Pipeline config '%s' successfully deployed.", pipeline_config_name) else: - logger.info(f"Pipeline config '{pipeline_config_name}' is already deployed.") + logger.info("Pipeline config '%s' is already deployed.", pipeline_config_name) logger.info( f"Search endpoint for pipeline config '{pipeline_config_name}' is up and running for you under {pipeline_url}" ) @@ -628,7 +628,7 @@ class PipelineClient: f" ]\n" f"}}'" ) - logger.info(f"Try it out using the following curl command:\n{curl_cmd}") + logger.info("Try it out using the following curl command:\n%s", curl_cmd) elif status == PipelineStatus.DEPLOYMENT_FAILED: raise DeepsetCloudError( @@ -673,9 +673,9 @@ class PipelineClient: if status == PipelineStatus.UNDEPLOYED: if changed: - logger.info(f"Pipeline config '{pipeline_config_name}' successfully undeployed.") + logger.info("Pipeline config '%s' successfully undeployed.", pipeline_config_name) else: - logger.info(f"Pipeline config '{pipeline_config_name}' is already undeployed.") + logger.info("Pipeline config '%s' is already undeployed.", pipeline_config_name) elif status in [PipelineStatus.DEPLOYMENT_IN_PROGRESS, PipelineStatus.DEPLOYMENT_SCHEDULED]: raise DeepsetCloudError( f"Undeployment of pipeline config '{pipeline_config_name}' aborted. Deployment was requested." 
@@ -754,7 +754,7 @@ class PipelineClient: raise DeepsetCloudError(f"Pipeline config '{pipeline_config_name}' does not exist anymore.") status = PipelineStatus.from_str(pipeline_info["status"]) if status in valid_transitioning_states: - logger.info(f"Current status of '{pipeline_config_name}' is: '{status}'") + logger.info("Current status of '%s' is: '%s'", pipeline_config_name, status) time.sleep(5) return status, True @@ -907,7 +907,7 @@ class EvaluationSetClient: f"Successfully uploaded evaluation set file {file_path}. You can access it now under evaluation set '{file_path.name}'." ) except DeepsetCloudError as e: - logger.error(f"Error uploading evaluation set file {file_path}: {e.args}") + logger.error("Error uploading evaluation set file %s: %s", file_path, e.args) def get_evaluation_set( self, evaluation_set: Optional[str] = None, workspace: Optional[str] = None @@ -993,9 +993,9 @@ class FileClient: file_id = response_file_upload.json().get("file_id") file_ids.append(file_id) except Exception as e: - logger.exception(f"Error uploading file {file_path}") + logger.exception("Error uploading file %s", file_path) - logger.info(f"Successfully uploaded {len(file_ids)} files.") + logger.info("Successfully uploaded %s files.", len(file_ids)) def delete_file(self, file_id: str, workspace: Optional[str] = None, headers: dict = None): """ @@ -1147,7 +1147,7 @@ class EvaluationRunClient: eval_run_url = f"{workspace_url}/eval_runs/{eval_run_name}" response = self.client.delete(eval_run_url, headers=headers) if response.status_code == 204: - logger.info(f"Evaluation run '{eval_run_name}' deleted.") + logger.info("Evaluation run '%s' deleted.", eval_run_name) def start_eval_run(self, eval_run_name: str, workspace: Optional[str] = None, headers: dict = None): """ @@ -1162,7 +1162,7 @@ class EvaluationRunClient: eval_run_url = f"{workspace_url}/eval_runs/{eval_run_name}/start" response = self.client.post(eval_run_url, headers=headers) if response.status_code == 204: - logger.info(f"Evaluation run '{eval_run_name}' has been started.") + logger.info("Evaluation run '%s' has been started.", eval_run_name) def update_eval_run( self, diff --git a/haystack/utils/doc_store.py b/haystack/utils/doc_store.py index 208036602..2e8423149 100644 --- a/haystack/utils/doc_store.py +++ b/haystack/utils/doc_store.py @@ -79,7 +79,7 @@ def launch_weaviate(sleep=15): def stop_container(container_name, delete_container=False): - logger.debug(f"Stopping {container_name}...") + logger.debug("Stopping %s...", container_name) status = subprocess.run([f"docker stop {container_name}"], shell=True) if status.returncode: logger.warning( @@ -117,7 +117,7 @@ def stop_service(document_store, delete_container=False): elif "WeaviateDocumentStore" in ds_class: stop_weaviate(delete_container) else: - logger.warning(f"No support yet for auto stopping the service behind a {type(document_store)}") + logger.warning("No support yet for auto stopping the service behind a %s", type(document_store)) def launch_milvus(sleep=15, delete_existing=False): diff --git a/haystack/utils/docker.py b/haystack/utils/docker.py index 2f2bdc181..0e70f3676 100644 --- a/haystack/utils/docker.py +++ b/haystack/utils/docker.py @@ -28,6 +28,6 @@ def cache_models(models: List[str] = None, use_auth_token: Optional[Union[str, b import transformers for model_to_cache in models: - logging.info(f"Caching {model_to_cache}") + logging.info("Caching %s", model_to_cache) transformers.AutoTokenizer.from_pretrained(model_to_cache, use_auth_token=use_auth_token) 
transformers.AutoModel.from_pretrained(model_to_cache, use_auth_token=use_auth_token) diff --git a/haystack/utils/experiment_tracking.py b/haystack/utils/experiment_tracking.py index c16c0538d..1a7d40cda 100644 --- a/haystack/utils/experiment_tracking.py +++ b/haystack/utils/experiment_tracking.py @@ -117,19 +117,19 @@ class StdoutTrackingHead(BaseTrackingHead): def init_experiment( self, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False ): - logger.info(f"\n **** Starting experiment '{experiment_name}' (Run: {run_name}) ****") + logger.info("\n **** Starting experiment '%s' (Run: %s) ****", experiment_name, run_name) def track_metrics(self, metrics: Dict[str, Any], step: int): - logger.info(f"Logged metrics at step {step}: \n {metrics}") + logger.info("Logged metrics at step %s: \n %s", step, metrics) def track_params(self, params: Dict[str, Any]): - logger.info(f"Logged parameters: \n {params}") + logger.info("Logged parameters: \n %s", params) def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None): - logger.warning(f"Cannot log artifacts with StdoutLogger: \n {dir_path}") + logger.warning("Cannot log artifacts with StdoutLogger: \n %s", dir_path) def end_run(self): - logger.info(f"**** End of Experiment **** ") + logger.info("**** End of Experiment **** ") class MLflowTrackingHead(BaseTrackingHead): @@ -148,7 +148,9 @@ class MLflowTrackingHead(BaseTrackingHead): mlflow.set_tracking_uri(self.tracking_uri) mlflow.set_experiment(experiment_name) mlflow.start_run(run_name=run_name, nested=nested, tags=tags) - logger.info(f"Tracking run {run_name} of experiment {experiment_name} by mlflow under {self.tracking_uri}") + logger.info( + "Tracking run %s of experiment %s by mlflow under %s", run_name, experiment_name, self.tracking_uri + ) if self.auto_track_environment: mlflow.log_params(flatten_dict({"environment": get_or_create_env_meta_data()})) except ConnectionError: @@ -163,9 +165,9 @@ class MLflowTrackingHead(BaseTrackingHead): metrics = flatten_dict(metrics) mlflow.log_metrics(metrics, step=step) except ConnectionError: - logger.warning(f"ConnectionError in logging metrics to MLflow.") + logger.warning("ConnectionError in logging metrics to MLflow.") except Exception as e: - logger.warning(f"Failed to log metrics: {e}") + logger.warning("Failed to log metrics: %s", e) def track_params(self, params: Dict[str, Any]): try: @@ -174,15 +176,15 @@ class MLflowTrackingHead(BaseTrackingHead): except ConnectionError: logger.warning("ConnectionError in logging params to MLflow") except Exception as e: - logger.warning(f"Failed to log params: {e}") + logger.warning("Failed to log params: %s", e) def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None): try: mlflow.log_artifacts(dir_path, artifact_path) except ConnectionError: - logger.warning(f"ConnectionError in logging artifacts to MLflow") + logger.warning("ConnectionError in logging artifacts to MLflow") except Exception as e: - logger.warning(f"Failed to log artifacts: {e}") + logger.warning("Failed to log artifacts: %s", e) def end_run(self): mlflow.end_run() diff --git a/haystack/utils/import_utils.py b/haystack/utils/import_utils.py index 8c3de529d..8435883ee 100644 --- a/haystack/utils/import_utils.py +++ b/haystack/utils/import_utils.py @@ -81,10 +81,10 @@ def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] = is_not_empty = len(list(Path(path).rglob("*"))) > 0 if is_not_empty: - logger.info(f"Found data stored in 
`{output_dir}`. Delete this first if you really want to fetch new data.") + logger.info("Found data stored in '%s'. Delete this first if you really want to fetch new data.", output_dir) return False else: - logger.info(f"Fetching from {url} to `{output_dir}`") + logger.info("Fetching from %s to '%s'", url, output_dir) _, _, archive_extension = url.rpartition(".") request_data = requests.get(url, proxies=proxies) diff --git a/haystack/utils/squad_to_dpr.py b/haystack/utils/squad_to_dpr.py index b6eb89433..1859d07c0 100644 --- a/haystack/utils/squad_to_dpr.py +++ b/haystack/utils/squad_to_dpr.py @@ -192,8 +192,8 @@ def create_dpr_training_dataset(squad_data: dict, retriever: BaseRetriever, num_ n_questions += 1 yield dict_DPR - logger.info(f"Number of skipped questions: {n_non_added_questions}") - logger.info(f"Number of added questions: {n_questions}") + logger.info("Number of skipped questions: %s", n_non_added_questions) + logger.info("Number of added questions: %s", n_questions) def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path, total_nb_questions: int, split_dataset: bool): diff --git a/rest_api/rest_api/pipeline/__init__.py b/rest_api/rest_api/pipeline/__init__.py index 62a5e77d7..b99805e23 100644 --- a/rest_api/rest_api/pipeline/__init__.py +++ b/rest_api/rest_api/pipeline/__init__.py @@ -26,17 +26,17 @@ def setup_pipelines() -> Dict[str, Any]: # Load query pipeline query_pipeline = Pipeline.load_from_yaml(Path(config.PIPELINE_YAML_PATH), pipeline_name=config.QUERY_PIPELINE_NAME) - logging.info(f"Loaded pipeline nodes: {query_pipeline.graph.nodes.keys()}") + logging.info("Loaded pipeline nodes: %s", query_pipeline.graph.nodes.keys()) pipelines["query_pipeline"] = query_pipeline # Find document store document_store = query_pipeline.get_document_store() - logging.info(f"Loaded docstore: {document_store}") + logging.info("Loaded docstore: %s", document_store) pipelines["document_store"] = document_store # Setup concurrency limiter concurrency_limiter = RequestLimiter(config.CONCURRENT_REQUEST_PER_WORKER) - logging.info("Concurrent requests per worker: {CONCURRENT_REQUEST_PER_WORKER}") + logging.info("Concurrent requests per worker: %s", config.CONCURRENT_REQUEST_PER_WORKER) pipelines["concurrency_limiter"] = concurrency_limiter # Load indexing pipeline (if available) @@ -53,7 +53,7 @@ def setup_pipelines() -> Dict[str, Any]: except PipelineConfigError as e: indexing_pipeline = None - logger.error(f"{e.message}\nFile Upload API will not be available.") + logger.error("%s\nFile Upload API will not be available.", e.message) finally: pipelines["indexing_pipeline"] = indexing_pipeline diff --git a/test/benchmarks/model_distillation.py b/test/benchmarks/model_distillation.py index 6dc633db4..b870b42f8 100644 --- a/test/benchmarks/model_distillation.py +++ b/test/benchmarks/model_distillation.py @@ -122,7 +122,7 @@ def main(): for current_config in combine_config(distillation_settings): descriptions.append(f"Results of student with distillation (config: {current_config}") # distillation training - logger.info(f"Training student with distillation (config: {current_config}") + logger.info("Training student with distillation (config: %s)", current_config) results.append( train_student_with_distillation( student, teacher, download_folder, train_file, test_file, **current_config, **training_settings @@ -145,9 +145,9 @@ def main(): logger.info("Evaluation results:") for result, description in zip(results, descriptions): logger.info(description) - logger.info(f"EM: {result['EM']}") - 
logger.info(f"F1: {result['f1']}") - logger.info(f"Top n accuracy: {result['top_n_accuracy']}") + logger.info("EM: %s", result["EM"]) + logger.info("F1: %s", result["f1"]) + logger.info("Top n accuracy: %s", result["top_n_accuracy"]) if __name__ == "__main__": diff --git a/test/benchmarks/reader.py b/test/benchmarks/reader.py index 82e592277..43596706e 100644 --- a/test/benchmarks/reader.py +++ b/test/benchmarks/reader.py @@ -51,7 +51,7 @@ def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs) index_to_doc_store(doc_store, docs, None, labels) for reader_name in reader_models: for reader_type in reader_types: - logger.info(f"##### Start reader run - model:{reader_name}, type: {reader_type} ##### ") + logger.info("##### Start reader run - model: %s, type: %s ##### ", reader_name, reader_type) try: reader = get_reader(reader_name, reader_type) results = reader.eval( diff --git a/test/benchmarks/retriever.py b/test/benchmarks/retriever.py index 69751477a..18bed0e75 100644 --- a/test/benchmarks/retriever.py +++ b/test/benchmarks/retriever.py @@ -56,7 +56,7 @@ def benchmark_indexing( retriever_results = [] for n_docs in n_docs_options: for retriever_name, doc_store_name in retriever_doc_stores: - logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ") + logger.info("##### Start indexing run: %s, %s, %s docs ##### ", retriever_name, doc_store_name, n_docs) try: doc_store = get_document_store(doc_store_name) retriever = get_retriever(retriever_name, doc_store, DEVICES) @@ -160,7 +160,7 @@ def benchmark_querying( for n_docs in n_docs_options: for retriever_name, doc_store_name in retriever_doc_stores: try: - logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ") + logger.info("##### Start querying run: %s, %s, %s docs ##### ", retriever_name, doc_store_name, n_docs) if retriever_name in ["elastic", "sentence_transformers"]: similarity = "cosine" else: @@ -273,7 +273,7 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs): ret = [] id_to_doc = {x.meta["passage_id"]: x for x in docs} for ef in embeddings_filenames: - logger.info(f"Adding precomputed embeddings from {embeddings_dir + ef}") + logger.info("Adding precomputed embeddings from %s", embeddings_dir + ef) filename = embeddings_dir + ef embeds = pickle.load(open(filename, "rb")) for i, vec in embeds: @@ -284,7 +284,7 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs): # In the official DPR repo, there are only 20594995 precomputed embeddings for 21015324 wikipedia passages # If there isn't an embedding for a given doc, we remove it here ret = [x for x in ret if x.embedding is not None] - logger.info(f"Embeddings loaded for {len(ret)}/{len(docs)} docs") + logger.info("Embeddings loaded for %s/%s docs", len(ret), len(docs)) return ret diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index ef9423bae..caaf992a3 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -170,16 +170,16 @@ def download_from_url(url: str, filepath: Union[str, Path]): :return: local path of the downloaded file """ - logger.info(f"Downloading {url}") + logger.info("Downloading %s", url) # Create local folder folder, filename = os.path.split(filepath) if not os.path.exists(folder): os.makedirs(folder) # Download file if not present locally if os.path.exists(filepath): - logger.info(f"Skipping {url} (exists locally)") + logger.info("Skipping %s (exists locally)", url) else: - 
logger.info(f"Downloading {url} to {filepath} ") + logger.info("Downloading %s to %s", filepath) with open(filepath, "wb") as file: http_get(url=url, temp_file=file) return filepath diff --git a/test/conftest.py b/test/conftest.py index 6b346dbb9..615ff9edd 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1024,7 +1024,7 @@ def adaptive_model_qa(num_processes): current_process = psutil.Process() children = current_process.children() if len(children) != 0: - logging.error(f"Not all the subprocesses are closed! {len(children)} are still running.") + logging.error("Not all the subprocesses are closed! %s are still running.", len(children)) @pytest.fixture diff --git a/test/mocks/pinecone.py b/test/mocks/pinecone.py index 388a08850..b0b3231b4 100644 --- a/test/mocks/pinecone.py +++ b/test/mocks/pinecone.py @@ -132,7 +132,7 @@ class Index: response: dict = {"namespace": namespace, "vectors": {}} if namespace not in self.index_config.namespaces: # If we query an empty/non-existent namespace, Pinecone will just return an empty response - logger.warning(f"No namespace called '{namespace}'") + logger.warning("No namespace called '%s'", namespace) return response records = self.index_config.namespaces[namespace] namespace_ids = records.keys() diff --git a/test/pipelines/test_pipeline.py b/test/pipelines/test_pipeline.py index 0d3e09eab..c044a118c 100644 --- a/test/pipelines/test_pipeline.py +++ b/test/pipelines/test_pipeline.py @@ -56,11 +56,11 @@ def reduce_windows_recursion_limit(): default_recursion_limit = sys.getrecursionlimit() if is_windows: reduced_recursion_limit = default_recursion_limit // 2 - logger.warning(f"Reducing recursion limit to {reduced_recursion_limit}") + logger.warning("Reducing recursion limit to %s", reduced_recursion_limit) sys.setrecursionlimit(reduced_recursion_limit) yield if is_windows: - logger.warning(f"Resetting recursion limit to {default_recursion_limit}") + logger.warning("Resetting recursion limit to %s", default_recursion_limit) sys.setrecursionlimit(default_recursion_limit) diff --git a/tutorials/Tutorial5_Evaluation.py b/tutorials/Tutorial5_Evaluation.py index 24acfff19..43c4f01fa 100644 --- a/tutorials/Tutorial5_Evaluation.py +++ b/tutorials/Tutorial5_Evaluation.py @@ -30,9 +30,9 @@ def tutorial5_evaluation(): doc_index = "tutorial5_docs" label_index = "tutorial5_labels" - ############################################## + ############################################### # Code - ############################################## + ############################################### launch_es() # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers