Mirror of https://github.com/deepset-ai/haystack.git (synced 2026-01-06 03:57:19 +00:00)
chore: remove f-strings from logs for performance reasons (#3212)
* Use the %s syntax on all debug messages
* Use the %s syntax on some more debug messages
* Use the %s syntax on info messages
* Use the %s syntax on warning messages
* Use the %s syntax on error and exception messages
* mypy
* pylint
* trigger tutorials execution in CI
* trigger tutorials execution on CI
* black
* remove embeddings from repr
* fix Document `__repr__`
* address feedback
* mypy
This commit is contained in:
parent 8fbccbda82
commit dcb132ba59
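Why the %s syntax is faster: the standard-library logging module defers interpolation of %s-style arguments until it knows the record will actually be emitted, whereas an f-string is rendered before logger.debug() is even called, so suppressed messages still pay the formatting cost. A minimal sketch of the pattern this commit applies (the logger setup and the document_count value are illustrative, not taken from Haystack):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

document_count = 10_000

# Eager: the f-string is built even though DEBUG records are discarded here.
logger.debug(f"Updating embeddings for {document_count} docs ...")

# Lazy: interpolation happens only if the record is actually emitted,
# so a suppressed debug message costs close to nothing.
logger.debug("Updating embeddings for %s docs ...", document_count)

pylint's logging-fstring-interpolation check (W1203) flags the eager form, which is presumably what the pylint pass in the commit message refers to.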
@@ -29,7 +29,7 @@ def disable_and_log(func):
args_as_kwargs = args_to_kwargs(args, func)
parameters = {**args_as_kwargs, **kwargs}
logger.info(f"Input to {func.__name__}: {parameters}")
logger.info("Input to %s: %s", func.__name__, parameters)
return wrapper
@@ -200,7 +200,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
"""
# Check if index_name refers to an alias
if self.client.indices.exists_alias(name=index_name):
logger.debug(f"Index name {index_name} is an alias.")
logger.debug("Index name %s is an alias.", index_name)
# check if the existing index has the embedding field; if not create it
if self.client.indices.exists(index=index_name, headers=headers):
@@ -949,7 +949,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
all_terms_must_match=all_terms_must_match,
)
logger.debug(f"Retriever query: {body}")
logger.debug("Retriever query: %s", body)
result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"]
documents = [
@@ -1086,7 +1086,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
body.append(headers)
body.append(cur_query_body)
logger.debug(f"Retriever query: {body}")
logger.debug("Retriever query: %s", body)
responses = self.client.msearch(index=index, body=body)
all_documents = []
@@ -1287,7 +1287,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
if excluded_meta_data:
body["_source"] = {"excludes": excluded_meta_data}
logger.debug(f"Retriever query: {body}")
logger.debug("Retriever query: %s", body)
try:
result = self.client.search(index=index, body=body, request_timeout=300, headers=headers)["hits"]["hits"]
if len(result) == 0:
@@ -1458,12 +1458,16 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
if update_existing_embeddings:
document_count = self.get_document_count(index=index, headers=headers)
logger.info(f"Updating embeddings for all {document_count} docs ...")
else:
document_count = self.get_document_count(
index=index, filters=filters, only_documents_without_embedding=True, headers=headers
)
logger.info(f"Updating embeddings for {document_count} docs without embeddings ...")
logger.info(
"Updating embeddings for all %s docs %s...",
document_count,
"without embeddings" if not update_existing_embeddings else "",
)
result = self._get_all_documents_in_index(
index=index,
@@ -1674,7 +1678,7 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
def _delete_index(self, index: str):
if self.client.indices.exists(index):
self.client.indices.delete(index=index, ignore=[400, 404])
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
class ElasticsearchDocumentStore(BaseElasticsearchDocumentStore):
@@ -346,7 +346,7 @@ class FAISSDocumentStore(SQLDocumentStore):
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return
logger.info(f"Updating embeddings for {document_count} docs...")
logger.info("Updating embeddings for %s docs...", document_count)
vector_id = sum(index.ntotal for index in self.faiss_indexes.values())
result = self._query(
@@ -568,7 +568,7 @@ class FAISSDocumentStore(SQLDocumentStore):
)
if index in self.faiss_indexes:
del self.faiss_indexes[index]
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
super().delete_index(index)
def query_by_embedding(
@@ -451,11 +451,10 @@ class InMemoryDocumentStore(BaseDocumentStore):
result = self._query(
index=index, filters=filters, only_documents_without_embedding=not update_existing_embeddings
)
document_count = len(result)
logger.info(f"Updating embeddings for {document_count} docs ...")
logger.info("Updating embeddings for %s docs ...", len(result) if logger.level > logging.DEBUG else 0)
batched_documents = get_batches_from_generator(result, batch_size)
with tqdm(
total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding"
total=len(result), disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding"
) as progress_bar:
for document_batch in batched_documents:
embeddings = retriever.embed_documents(document_batch)  # type: ignore
@@ -782,7 +781,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
"""
if index in self.indexes:
del self.indexes[index]
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
def delete_labels(
self,
@@ -37,7 +37,7 @@ class InMemoryKnowledgeGraph(BaseKnowledgeGraph):
if index not in self.indexes:
self.indexes[index] = Graph()
else:
logger.warning(f"Index '{index}' is already present.")
logger.warning("Index '%s' is already present.", index)
def delete_index(self, index: Optional[str] = None):
"""
@@ -49,7 +49,7 @@ class InMemoryKnowledgeGraph(BaseKnowledgeGraph):
if index in self.indexes:
del self.indexes[index]
logger.info(f"Index '{index}' deleted.")
logger.info("Index '%s' deleted.", index)
def import_from_ttl_file(self, path: Path, index: Optional[str] = None):
"""
@ -313,13 +313,6 @@ class Milvus1DocumentStore(SQLDocumentStore):
|
||||
index = index or self.index
|
||||
self._create_collection_and_index_if_not_exist(index)
|
||||
|
||||
document_count = self.get_document_count(index=index)
|
||||
if document_count == 0:
|
||||
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
|
||||
return
|
||||
|
||||
logger.info(f"Updating embeddings for {document_count} docs...")
|
||||
|
||||
result = self._query(
|
||||
index=index,
|
||||
vector_ids=None,
|
||||
@ -327,6 +320,14 @@ class Milvus1DocumentStore(SQLDocumentStore):
|
||||
filters=filters,
|
||||
only_documents_without_embedding=not update_existing_embeddings,
|
||||
)
|
||||
|
||||
document_count = len(result)
|
||||
if document_count == 0:
|
||||
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
|
||||
return
|
||||
|
||||
logger.info("Updating embeddings for %s docs...", document_count)
|
||||
|
||||
batched_documents = get_batches_from_generator(result, batch_size)
|
||||
with tqdm(
|
||||
total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding"
|
||||
@ -665,10 +666,10 @@ class Milvus1DocumentStore(SQLDocumentStore):
|
||||
index = index or self.index
|
||||
status, collection_info = self.milvus_server.get_collection_stats(collection_name=index)
|
||||
if not status.OK():
|
||||
logger.info(f"Failed fetch stats from store ...")
|
||||
logger.info("Failed fetch stats from store ...")
|
||||
return list()
|
||||
|
||||
logger.debug(f"collection_info = {collection_info}")
|
||||
logger.debug("collection_info = %s", collection_info)
|
||||
|
||||
ids = list()
|
||||
partition_list = collection_info["partitions"]
|
||||
@ -679,16 +680,16 @@ class Milvus1DocumentStore(SQLDocumentStore):
|
||||
status, id_list = self.milvus_server.list_id_in_segment(
|
||||
collection_name=index, segment_name=segment_name
|
||||
)
|
||||
logger.debug(f"{status}: segment {segment_name} has {len(id_list)} vectors ...")
|
||||
logger.debug("%s: segment %s has %s vectors ...", status, segment_name, len(id_list))
|
||||
ids.extend(id_list)
|
||||
|
||||
if len(ids) == 0:
|
||||
logger.info(f"No documents in the store ...")
|
||||
logger.info("No documents in the store ...")
|
||||
return list()
|
||||
|
||||
status, vectors = self.milvus_server.get_entity_by_id(collection_name=index, ids=ids)
|
||||
if not status.OK():
|
||||
logger.info(f"Failed fetch document for ids {ids} from store ...")
|
||||
logger.info("Failed fetch document for ids %s from store ...", ids)
|
||||
return list()
|
||||
|
||||
return vectors
|
||||
|
||||
@ -204,7 +204,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
|
||||
|
||||
for field in custom_fields:
|
||||
if field.name == self.id_field or field.name == self.embedding_field:
|
||||
logger.warning(f"Skipping `{field.name}` as it is similar to `id_field` or `embedding_field`")
|
||||
logger.warning("Skipping '%s' as it is similar to 'id_field' or 'embedding_field'", field.name)
|
||||
else:
|
||||
fields.append(field)
|
||||
|
||||
@ -353,7 +353,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
|
||||
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
|
||||
return
|
||||
|
||||
logger.info(f"Updating embeddings for {document_count} docs...")
|
||||
logger.info("Updating embeddings for %s docs...", document_count)
|
||||
|
||||
result = self._query(
|
||||
index=index,
|
||||
@ -516,7 +516,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
|
||||
def _delete_index(self, index: str):
|
||||
if utility.has_collection(collection_name=index):
|
||||
utility.drop_collection(collection_name=index)
|
||||
logger.info(f"Index '{index}' deleted.")
|
||||
logger.info("Index '%s' deleted.", index)
|
||||
super().delete_index(index)
|
||||
|
||||
def get_all_documents_generator(
|
||||
|
||||
@ -379,7 +379,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore):
|
||||
if excluded_meta_data:
|
||||
body["_source"] = {"excludes": excluded_meta_data}
|
||||
|
||||
logger.debug(f"Retriever query: {body}")
|
||||
logger.debug("Retriever query: %s", body)
|
||||
result = self.client.search(index=index, body=body, request_timeout=300, headers=headers)["hits"]["hits"]
|
||||
|
||||
documents = [
|
||||
@ -396,7 +396,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore):
|
||||
"""
|
||||
# Check if index_name refers to an alias
|
||||
if self.client.indices.exists_alias(name=index_name):
|
||||
logger.debug(f"Index name {index_name} is an alias.")
|
||||
logger.debug("Index name %s is an alias.", index_name)
|
||||
|
||||
# check if the existing index has the embedding field; if not create it
|
||||
if self.client.indices.exists(index=index_name, headers=headers):
|
||||
|
||||
@ -216,7 +216,7 @@ class PineconeDocumentStore(BaseDocumentStore):
|
||||
stats = index_connection.describe_index_stats()
|
||||
dims = stats["dimension"]
|
||||
count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0
|
||||
logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}")
|
||||
logger.info("Index statistics: name: %s embedding dimensions: %s, record count: %s", index, dims, count)
|
||||
# return index connection
|
||||
return index_connection
|
||||
|
||||
@ -471,7 +471,7 @@ class PineconeDocumentStore(BaseDocumentStore):
|
||||
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
|
||||
return
|
||||
|
||||
logger.info(f"Updating embeddings for {document_count} docs...")
|
||||
logger.info("Updating embeddings for %s docs...", document_count)
|
||||
|
||||
# If the embedding namespace is empty or the user does not want to update existing embeddings, we use document namespace
|
||||
if self.get_embedding_count(index=index) == 0 or not update_existing_embeddings:
|
||||
@ -1003,7 +1003,7 @@ class PineconeDocumentStore(BaseDocumentStore):
|
||||
index = self._index_name(index)
|
||||
if index in pinecone.list_indexes():
|
||||
pinecone.delete_index(index)
|
||||
logger.info(f"Index '{index}' deleted.")
|
||||
logger.info("Index '%s' deleted.", index)
|
||||
if index in self.pinecone_indexes:
|
||||
del self.pinecone_indexes[index]
|
||||
if index in self.all_ids:
|
||||
|
||||
@ -402,7 +402,7 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
try:
|
||||
meta_orms.append(MetaDocumentORM(name=key, value=value))
|
||||
except TypeError as ex:
|
||||
logger.error(f"Document {doc.id} - {ex}")
|
||||
logger.error("Document %s - %s", doc.id, ex)
|
||||
doc_mapping = {
|
||||
"id": doc.id,
|
||||
"content": doc.to_dict()["content"],
|
||||
@ -425,7 +425,7 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
try:
|
||||
self.session.commit()
|
||||
except Exception as ex:
|
||||
logger.error(f"Transaction rollback: {ex.__cause__}")
|
||||
logger.error("Transaction rollback: %s", ex.__cause__)
|
||||
# Rollback is important here otherwise self.session will be in inconsistent state and next call will fail
|
||||
self.session.rollback()
|
||||
raise ex
|
||||
@ -495,7 +495,7 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
try:
|
||||
self.session.commit()
|
||||
except Exception as ex:
|
||||
logger.error(f"Transaction rollback: {ex.__cause__}")
|
||||
logger.error("Transaction rollback: %s", ex.__cause__)
|
||||
self.session.rollback()
|
||||
raise ex
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ def eval_data_from_json(
|
||||
with open(filename, "r", encoding="utf-8") as file:
|
||||
data = json.load(file)
|
||||
if "title" not in data["data"][0]:
|
||||
logger.warning(f"No title information found for documents in QA file: {filename}")
|
||||
logger.warning("No title information found for documents in QA file: %s", filename)
|
||||
|
||||
for squad_document in data["data"]:
|
||||
if max_docs:
|
||||
|
||||
@ -313,7 +313,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
try:
|
||||
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
|
||||
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
|
||||
logging.debug(f"Weaviate could not get the document requested: {usce}")
|
||||
logging.debug("Weaviate could not get the document requested: %s", usce)
|
||||
if result:
|
||||
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
|
||||
return document
|
||||
@ -340,7 +340,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
try:
|
||||
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
|
||||
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
|
||||
logging.debug(f"Weaviate could not get the document requested: {usce}")
|
||||
logging.debug("Weaviate could not get the document requested: %s", usce)
|
||||
if result:
|
||||
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
|
||||
documents.append(document)
|
||||
@ -560,7 +560,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
and "error" in result["result"]["errors"]
|
||||
):
|
||||
for message in result["result"]["errors"]["error"]:
|
||||
logger.error(f"{message['message']}")
|
||||
logger.error(message["message"])
|
||||
progress_bar.update(batch_size)
|
||||
progress_bar.close()
|
||||
|
||||
@ -1215,7 +1215,10 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
raise RuntimeError("Specify the arg `embedding_field` when initializing WeaviateDocumentStore()")
|
||||
|
||||
if update_existing_embeddings:
|
||||
logger.info(f"Updating embeddings for all {self.get_document_count(index=index)} docs ...")
|
||||
logger.info(
|
||||
"Updating embeddings for all %s docs ...",
|
||||
self.get_document_count(index=index) if logger.level > logging.DEBUG else 0,
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"All the documents in Weaviate store have an embedding by default. Only update is allowed!"
|
||||
@ -1379,7 +1382,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
index = self._sanitize_index_name(index) or index
|
||||
if any(c for c in self.weaviate_client.schema.get()["classes"] if c["class"] == index):
|
||||
self.weaviate_client.schema.delete_class(index)
|
||||
logger.info(f"Index '{index}' deleted.")
|
||||
logger.info("Index '%s' deleted.", index)
|
||||
|
||||
def delete_labels(self):
|
||||
"""
|
||||
|
||||
@ -221,7 +221,7 @@ class DataSilo:
|
||||
"""
|
||||
Load serialized dataset from a cache.
|
||||
"""
|
||||
logger.info(f"Loading datasets from cache at {cache_dir}")
|
||||
logger.info("Loading datasets from cache at %s", cache_dir)
|
||||
self.data["train"] = torch.load(cache_dir / "train_dataset")
|
||||
|
||||
dev_dataset_path = cache_dir / "dev_dataset"
|
||||
@ -277,7 +277,7 @@ class DataSilo:
|
||||
torch.save(self.data["test"], cache_dir / "test_dataset")
|
||||
|
||||
torch.save(self.tensor_names, cache_dir / "tensor_names")
|
||||
logger.info(f"Cached the datasets at {cache_dir}")
|
||||
logger.info("Cached the datasets at %s", cache_dir)
|
||||
|
||||
def _initialize_data_loaders(self):
|
||||
"""
|
||||
|
||||
@ -63,7 +63,7 @@ def convert_features_to_dataset(features):
|
||||
)
|
||||
except:
|
||||
logger.debug(
|
||||
f"Could not determine type for feature '{t_name}'. " "Converting now to a tensor of default type long."
|
||||
"Could not determine type for feature '%s'. Converting now to a tensor of default type long.", t_name
|
||||
)
|
||||
|
||||
# Convert all remaining python objects to torch long tensors
|
||||
|
||||
@ -147,7 +147,7 @@ class Processor(ABC):
|
||||
sig = signature(cls.subclasses[processor_name])
|
||||
unused_args = {k: v for k, v in kwargs.items() if k not in sig.parameters}
|
||||
logger.debug(
|
||||
f"Got more parameters than needed for loading {processor_name}: {unused_args}. " f"Those won't be used!"
|
||||
"Got more parameters than needed for loading %s: %s. Those won't be used!", processor_name, unused_args
|
||||
)
|
||||
processor = cls.subclasses[processor_name](
|
||||
data_dir=data_dir,
|
||||
@ -324,7 +324,9 @@ class Processor(ABC):
|
||||
if problematic_sample_ids:
|
||||
n_problematic = len(problematic_sample_ids)
|
||||
problematic_id_str = ", ".join([str(i) for i in problematic_sample_ids])
|
||||
logger.error(f"Unable to convert {n_problematic} samples to features. Their ids are : {problematic_id_str}")
|
||||
logger.error(
|
||||
"Unable to convert %s samples to features. Their ids are : %s", n_problematic, problematic_id_str
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _check_sample_features(basket: SampleBasket):
|
||||
@ -348,7 +350,7 @@ class Processor(ABC):
|
||||
return True
|
||||
|
||||
def _log_samples(self, n_samples: int, baskets: List[SampleBasket]):
|
||||
logger.debug("*** Show {} random examples ***".format(n_samples))
|
||||
logger.debug("*** Show %s random examples ***", n_samples)
|
||||
if len(baskets) == 0:
|
||||
logger.debug("*** No samples to show because there are no baskets ***")
|
||||
return
|
||||
@ -1817,7 +1819,7 @@ class TextClassificationProcessor(Processor):
|
||||
self.header = header
|
||||
self.max_samples = max_samples
|
||||
self.dev_stratification = dev_stratification
|
||||
logger.debug(f"Currently no support in Processor for returning problematic ids")
|
||||
logger.debug("Currently no support in Processor for returning problematic ids")
|
||||
|
||||
super(TextClassificationProcessor, self).__init__(
|
||||
tokenizer=tokenizer,
|
||||
@ -2141,14 +2143,14 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None
|
||||
dev_labels[q["id"]] = q["answers"][0]["text"]
|
||||
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
|
||||
if len(not_included) > 0:
|
||||
logger.info(f"There were missing predicitons for question ids: {list(not_included)}")
|
||||
logger.info("There were missing predicitons for question ids: %s", list(not_included))
|
||||
for x in not_included:
|
||||
predictions_json[x] = ""
|
||||
|
||||
# os.makedirs("model_output", exist_ok=True)
|
||||
# filepath = Path("model_output") / out_filename
|
||||
json.dump(predictions_json, open(out_filename, "w"))
|
||||
logger.info(f"Written Squad predictions to: {out_filename}")
|
||||
logger.info("Written Squad predictions to: %s", out_filename)
|
||||
|
||||
|
||||
def _read_dpr_json(
|
||||
@ -2188,7 +2190,7 @@ def _read_dpr_json(
|
||||
"""
|
||||
# get remote dataset if needed
|
||||
if not os.path.exists(file):
|
||||
logger.info(f" Couldn't find {file} locally. Trying to download ...")
|
||||
logger.info("Couldn't find %s locally. Trying to download ...", file)
|
||||
_download_extract_downstream_data(file, proxies=proxies)
|
||||
|
||||
if Path(file).suffix.lower() == ".jsonl":
|
||||
@ -2250,7 +2252,7 @@ def _read_dpr_json(
|
||||
def _read_squad_file(filename: str, proxies=None):
|
||||
"""Read a SQuAD json file"""
|
||||
if not os.path.exists(filename):
|
||||
logger.info(f" Couldn't find {filename} locally. Trying to download ...")
|
||||
logger.info("Couldn't find %s locally. Trying to download ...", filename)
|
||||
_download_extract_downstream_data(filename, proxies)
|
||||
with open(filename, "r", encoding="utf-8") as reader:
|
||||
input_data = json.load(reader)["data"]
|
||||
|
||||
@ -112,10 +112,10 @@ class Evaluator:
|
||||
for head_num, head in enumerate(model.prediction_heads):
|
||||
if head.model_type == "span_classification" and calibrate_conf_scores:
|
||||
temperature_previous = head.temperature_for_confidence.item()
|
||||
logger.info(f"temperature used for confidence scores before calibration: {temperature_previous}")
|
||||
logger.info("temperature used for confidence scores before calibration: %s", temperature_previous)
|
||||
head.calibrate_conf(logits_all[head_num], label_all[head_num])
|
||||
temperature_current = head.temperature_for_confidence.item()
|
||||
logger.info(f"temperature used for confidence scores after calibration: {temperature_current}")
|
||||
logger.info("temperature used for confidence scores after calibration: %s", temperature_current)
|
||||
temperature_change = (abs(temperature_current - temperature_previous) / temperature_previous) * 100.0
|
||||
if temperature_change > 50:
|
||||
logger.warning(
|
||||
|
||||
@ -283,7 +283,7 @@ class Inferencer:
|
||||
else:
|
||||
num_processes = mp.cpu_count()
|
||||
self.process_pool = mp.Pool(processes=num_processes)
|
||||
logger.info(f"Got ya {num_processes} parallel workers to do inference ...")
|
||||
logger.info("Got ya %s parallel workers to do inference ...", num_processes)
|
||||
log_ascii_workers(n=num_processes, logger=logger)
|
||||
|
||||
def close_multiprocessing_pool(self, join: bool = False):
|
||||
|
||||
@ -142,7 +142,7 @@ class BaseAdaptiveModel:
|
||||
"does not currently support saving and loading"
|
||||
)
|
||||
assert len(model_files) == len(config_files), error_str
|
||||
logger.info(f"Found files for loading {len(model_files)} prediction heads")
|
||||
logger.info("Found files for loading %s prediction heads", len(model_files))
|
||||
|
||||
return model_files, config_files
|
||||
|
||||
@ -573,7 +573,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
|
||||
try:
|
||||
tracker.track_params(params)
|
||||
except Exception as e:
|
||||
logger.warning(f"ML logging didn't work: {e}")
|
||||
logger.warning("ML logging didn't work: %s", e)
|
||||
|
||||
def verify_vocab_size(self, vocab_size: int):
|
||||
"""
|
||||
|
||||
@ -376,7 +376,7 @@ class BiAdaptiveModel(nn.Module):
|
||||
try:
|
||||
tracker.track_params(params)
|
||||
except Exception as e:
|
||||
logger.warning(f"ML logging didn't work: {e}")
|
||||
logger.warning("ML logging didn't work: %s", e)
|
||||
|
||||
def verify_vocab_size(self, vocab_size1: int, vocab_size2: int):
|
||||
"""
|
||||
|
||||
@ -488,7 +488,7 @@ class HFLanguageModelNoSegmentIds(HFLanguageModelWithPooler):
|
||||
specified using the arguments `output_hidden_states` and `output_attentions`.
|
||||
"""
|
||||
if segment_ids is not None:
|
||||
logging.warning(f"`segment_ids` is not None, but {self.name} does not use them. They will be ignored.")
|
||||
logger.warning(f"'segment_ids' is not None, but %s does not use them. They will be ignored.", self.name)
|
||||
|
||||
return super().forward(
|
||||
input_ids=input_ids,
|
||||
@ -906,7 +906,7 @@ def get_language_model(
|
||||
"Ensure that the model class name can be inferred from the directory name "
|
||||
"when loading a Transformers model."
|
||||
)
|
||||
logger.error(f"Using the AutoModel class for '{pretrained_model_name_or_path}'. This can cause crashes!")
|
||||
logger.error("Using the AutoModel class for '%s'. This can cause crashes!", pretrained_model_name_or_path)
|
||||
model_type = "Auto"
|
||||
|
||||
# Find the class corresponding to this model type
|
||||
@ -918,7 +918,9 @@ def get_language_model(
|
||||
f"Supported model types are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}"
|
||||
)
|
||||
|
||||
logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}' {'(' + model_type + ')' if model_type else ''}")
|
||||
logger.info(
|
||||
" * LOADING MODEL: '%s' %s", pretrained_model_name_or_path, "(" + model_type + ")" if model_type else ""
|
||||
)
|
||||
|
||||
# Instantiate the class for this model
|
||||
language_model = language_model_class(
|
||||
@ -930,8 +932,10 @@ def get_language_model(
|
||||
model_kwargs=model_kwargs,
|
||||
)
|
||||
logger.info(
|
||||
f"Loaded '{pretrained_model_name_or_path}' ({model_type} model) "
|
||||
f"from {'local file system' if config_file_exists else 'model hub'}."
|
||||
"Loaded '%s' (%s model) from %s.",
|
||||
pretrained_model_name_or_path,
|
||||
model_type,
|
||||
"local file system" if config_file_exists else "model hub",
|
||||
)
|
||||
return language_model
|
||||
|
||||
@ -963,7 +967,7 @@ def _get_model_type(
|
||||
model_type = config.architectures[0] if is_supported_model(config.architectures[0]) else None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AutoConfig failed to load on '{model_name_or_path}': {str(e)}")
|
||||
logger.error("AutoConfig failed to load on '%s': %s", model_name_or_path, e)
|
||||
|
||||
if not model_type:
|
||||
logger.warning("Could not infer the model type from its config. Looking for clues in the model name.")
|
||||
@ -991,5 +995,5 @@ def _guess_language(name: str) -> str:
|
||||
language = languages[0]
|
||||
else:
|
||||
language = "english"
|
||||
logger.info(f"Auto-detected model language: {language}")
|
||||
logger.info("Auto-detected model language: %s", language)
|
||||
return language
|
||||
|
||||
@ -188,7 +188,7 @@ def _get_optim(model, opts: Dict):
|
||||
optimizer_name = opts.pop("name", None)
|
||||
|
||||
# Logging
|
||||
logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'")
|
||||
logger.info("Loading optimizer '%s': %s", optimizer_name, opts)
|
||||
tracker.track_params(opts)
|
||||
tracker.track_params({"optimizer_name": optimizer_name})
|
||||
|
||||
@ -271,7 +271,7 @@ def get_scheduler(optimizer, opts):
|
||||
except AttributeError:
|
||||
raise AttributeError(f"Scheduler '{schedule_name}' not found in 'torch' or 'transformers'")
|
||||
|
||||
logger.info(f"Using scheduler '{schedule_name}'")
|
||||
logger.info("Using scheduler '%s'", schedule_name)
|
||||
|
||||
# get supported args of constructor
|
||||
allowed_args = inspect.signature(sched_constructor).parameters.keys()
|
||||
@ -285,7 +285,7 @@ def get_scheduler(optimizer, opts):
|
||||
constructor_opts = {k: v for k, v in opts.items() if k in allowed_args}
|
||||
|
||||
# Logging
|
||||
logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'")
|
||||
logger.info("Loading schedule '%s': '%s'", schedule_name, constructor_opts)
|
||||
tracker.track_params(constructor_opts)
|
||||
tracker.track_params({"schedule_name": schedule_name})
|
||||
|
||||
|
||||
@ -265,7 +265,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
self.layer_dims = layer_dims
|
||||
assert self.layer_dims[-1] == 2
|
||||
self.feed_forward = FeedForwardBlock(self.layer_dims)
|
||||
logger.debug(f"Prediction head initialized with size {self.layer_dims}")
|
||||
logger.debug("Prediction head initialized with size %s", self.layer_dims)
|
||||
self.num_labels = self.layer_dims[-1]
|
||||
self.ph_output_type = "per_token_squad"
|
||||
self.model_type = "span_classification" # predicts start and end token of answer
|
||||
|
||||
@ -409,7 +409,7 @@ class TriAdaptiveModel(nn.Module):
|
||||
try:
|
||||
tracker.track_params(params)
|
||||
except Exception as e:
|
||||
logger.warning(f"ML logging didn't work: {e}")
|
||||
logger.warning("ML logging didn't work: %s", e)
|
||||
|
||||
def verify_vocab_size(self, vocab_size1: int, vocab_size2: int, vocab_size3: int):
|
||||
"""Verifies that the model fits to the tokenizer vocabulary.
|
||||
|
||||
@ -192,9 +192,9 @@ class Trainer:
|
||||
# when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
|
||||
if resume_from_step and step <= resume_from_step:
|
||||
if step % 10000 == 0:
|
||||
logger.info(f"Skipping {step} out of {resume_from_step} steps ...")
|
||||
logger.info("Skipping %s out of %s steps ...", step, resume_from_step)
|
||||
if resume_from_step == step:
|
||||
logger.info(f"Finished skipping {resume_from_step} steps ...")
|
||||
logger.info("Finished skipping %s steps ...", resume_from_step)
|
||||
resume_from_step = None
|
||||
else:
|
||||
continue
|
||||
@ -385,9 +385,9 @@ class Trainer:
|
||||
trainer = cls._load_checkpoint(
|
||||
path=checkpoint_to_load, data_silo=data_silo, model=model, optimizer=optimizer, local_rank=local_rank
|
||||
)
|
||||
logging.info(f"Resuming training from the train checkpoint at {checkpoint_to_load} ...")
|
||||
logging.info("Resuming training from the train checkpoint at %s ...", checkpoint_to_load)
|
||||
else:
|
||||
logging.info(f"No train checkpoints found. Starting a new training ...")
|
||||
logging.info("No train checkpoints found. Starting a new training ...")
|
||||
trainer = cls(
|
||||
data_silo=data_silo,
|
||||
model=model,
|
||||
@ -447,7 +447,7 @@ class Trainer:
|
||||
data_silo=data_silo, model=model, optimizer=optimizer, lr_schedule=scheduler, **trainer_state_dict
|
||||
)
|
||||
|
||||
logger.info(f"Loaded a train checkpoint from {path}")
|
||||
logger.info("Loaded a train checkpoint from %s", path)
|
||||
return trainer
|
||||
|
||||
@classmethod
|
||||
@ -516,7 +516,7 @@ class Trainer:
|
||||
for cp in saved_checkpoints[self.checkpoints_to_keep :]:
|
||||
shutil.rmtree(cp)
|
||||
|
||||
logger.info(f"Saved a training checkpoint after {checkpoint_name}")
|
||||
logger.info("Saved a training checkpoint after %s", checkpoint_name)
|
||||
|
||||
def _get_state_dict(self):
|
||||
"""
|
||||
|
||||
@ -102,8 +102,9 @@ def initialize_device_settings(
|
||||
device_to_replace = torch.device("cuda")
|
||||
devices_to_use = [torch.device("cuda:0") if device == device_to_replace else device for device in devices_to_use]
|
||||
|
||||
logger.info(f"Using devices: {', '.join([str(device) for device in devices_to_use]).upper()}")
|
||||
logger.info(f"Number of GPUs: {n_gpu}")
|
||||
logger.info(
|
||||
"Using devices: %s - Number of GPUs: %s", ", ".join([str(device) for device in devices_to_use]).upper(), n_gpu
|
||||
)
|
||||
return devices_to_use, n_gpu
|
||||
|
||||
|
||||
@ -135,7 +136,7 @@ def try_get(keys, dictionary):
|
||||
ret = ret[0]
|
||||
return ret
|
||||
except Exception as e:
|
||||
logger.warning(f"Cannot extract from dict {dictionary} with error: {e}")
|
||||
logger.warning("Cannot extract from dict %s with error: %s", dictionary, e)
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@ -176,7 +176,7 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
|
||||
|
||||
node_name = getattr(node_class, "__name__")
|
||||
|
||||
logger.info(f"Creating schema for '{node_name}'")
|
||||
logger.info("Creating schema for '%s'", node_name)
|
||||
|
||||
# Read the relevant init parameters from __init__'s signature
|
||||
init_method = getattr(node_class, "__init__", None)
|
||||
@ -400,7 +400,7 @@ def inject_definition_in_schema(node_class: Type[BaseComponent], schema: Dict[st
|
||||
schema_definition, node_ref = create_schema_for_node_class(node_class)
|
||||
schema["definitions"].update(schema_definition)
|
||||
schema["properties"]["components"]["items"]["anyOf"].append(node_ref)
|
||||
logger.info(f"Added definition for {getattr(node_class, '__name__')}")
|
||||
logger.info("Added definition for %s", getattr(node_class, "__name__"))
|
||||
return schema
|
||||
|
||||
|
||||
|
||||
@ -124,7 +124,7 @@ class RAGenerator(BaseGenerator):
|
||||
|
||||
if top_k > self.num_beams:
|
||||
top_k = self.num_beams
|
||||
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {num_beams}")
|
||||
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", num_beams)
|
||||
|
||||
self.top_k = top_k
|
||||
|
||||
@ -238,7 +238,7 @@ class RAGenerator(BaseGenerator):
|
||||
|
||||
if top_k > self.num_beams:
|
||||
top_k = self.num_beams
|
||||
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {top_k}")
|
||||
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", top_k)
|
||||
|
||||
# Flatten the documents so easy to reference
|
||||
flat_docs_dict = self._flatten_docs(documents)
|
||||
@ -382,7 +382,7 @@ class Seq2SeqGenerator(BaseGenerator):
|
||||
|
||||
if top_k > self.num_beams:
|
||||
top_k = self.num_beams
|
||||
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {num_beams}")
|
||||
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", num_beams)
|
||||
|
||||
self.top_k = top_k
|
||||
|
||||
@ -434,7 +434,7 @@ class Seq2SeqGenerator(BaseGenerator):
|
||||
|
||||
if top_k > self.num_beams:
|
||||
top_k = self.num_beams
|
||||
logger.warning(f"top_k value should not be greater than num_beams, hence setting it to {top_k}")
|
||||
logger.warning("top_k value should not be greater than num_beams, hence setting it to %s", top_k)
|
||||
|
||||
converter: Callable = Seq2SeqGenerator._get_converter(self.model_name_or_path)
|
||||
if not converter:
|
||||
|
||||
@ -216,9 +216,11 @@ class Crawler(BaseComponent):
|
||||
file_paths: list = []
|
||||
is_not_empty = len(list(output_dir.rglob("*"))) > 0
|
||||
if is_not_empty and not overwrite_existing_files:
|
||||
logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
|
||||
logger.info(
|
||||
"Found data stored in `%s`. Delete this first if you really want to fetch new data.", output_dir
|
||||
)
|
||||
else:
|
||||
logger.info(f"Fetching from {urls} to `{output_dir}`")
|
||||
logger.info("Fetching from %s to `%s`", urls, output_dir)
|
||||
|
||||
# Start by writing out the initial list of urls
|
||||
if filter_urls:
|
||||
@ -278,7 +280,7 @@ class Crawler(BaseComponent):
|
||||
) -> List[Path]:
|
||||
paths = []
|
||||
for link in urls:
|
||||
logger.info(f"writing contents from `{link}`")
|
||||
logger.info("writing contents from '%s'", link)
|
||||
self.driver.get(link)
|
||||
if loading_wait_time is not None:
|
||||
time.sleep(loading_wait_time)
|
||||
|
||||
@ -37,7 +37,7 @@ class BaseDocumentClassifier(BaseComponent):
|
||||
results = []
|
||||
|
||||
document_ids = [doc.id for doc in results]
|
||||
logger.debug(f"Classified documents with IDs: {document_ids}")
|
||||
logger.debug("Classified documents with IDs: %s", document_ids)
|
||||
|
||||
# convert back to dicts if we are in an indexing pipeline
|
||||
if root_node == "File":
|
||||
@ -54,11 +54,11 @@ class BaseDocumentClassifier(BaseComponent):
|
||||
|
||||
if isinstance(documents[0], Document):
|
||||
document_ids = [doc.id for doc in results]
|
||||
logger.debug(f"Classified documents with IDs: {document_ids}")
|
||||
logger.debug("Classified documents with IDs: %s", document_ids)
|
||||
else:
|
||||
for doc_list in results:
|
||||
document_ids = [doc.id for doc in doc_list]
|
||||
logger.debug(f"Classified documents with IDs: {document_ids}")
|
||||
logger.debug("Classified documents with IDs: %s", document_ids)
|
||||
|
||||
return output, "output_1"
|
||||
|
||||
|
||||
@ -135,7 +135,7 @@ class ImageToTextConverter(BaseConverter):
|
||||
# remove lines having > 40% of words as digits AND not ending with a period(.)
|
||||
if remove_numeric_tables:
|
||||
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
|
||||
logger.debug(f"Removing line '{line}' from file")
|
||||
logger.debug("Removing line '%s' from file", line)
|
||||
continue
|
||||
cleaned_lines.append(line)
|
||||
|
||||
|
||||
@ -140,7 +140,7 @@ class PDFToTextConverter(BaseConverter):
|
||||
# remove lines having > 40% of words as digits AND not ending with a period(.)
|
||||
if remove_numeric_tables:
|
||||
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
|
||||
logger.debug(f"Removing line '{line}' from {file_path}")
|
||||
logger.debug("Removing line '%s' from %s", line, file_path)
|
||||
continue
|
||||
cleaned_lines.append(line)
|
||||
|
||||
@ -256,7 +256,7 @@ class PDFToTextOCRConverter(BaseConverter):
|
||||
image.save(temp_img.name)
|
||||
pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
|
||||
except Exception as exception:
|
||||
logger.error(f"File {file_path} has an error \n {exception}")
|
||||
logger.error("File %s has an error:\n%s", file_path, exception)
|
||||
|
||||
raw_text = "\f".join(pages)
|
||||
document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
|
||||
|
||||
@ -160,7 +160,7 @@ class TikaConverter(BaseConverter):
|
||||
# remove lines having > 40% of words as digits AND not ending with a period(.)
|
||||
if remove_numeric_tables:
|
||||
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
|
||||
logger.debug(f"Removing line '{line}' from {file_path}")
|
||||
logger.debug("Removing line '%s' from %s", line, file_path)
|
||||
continue
|
||||
|
||||
cleaned_lines.append(line)
|
||||
|
||||
@ -63,7 +63,7 @@ class TextConverter(BaseConverter):
|
||||
# remove lines having > 40% of words as digits AND not ending with a period(.)
|
||||
if remove_numeric_tables:
|
||||
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
|
||||
logger.debug(f"Removing line '{line}' from {file_path}")
|
||||
logger.debug("Removing line '%s' from %s", line, file_path)
|
||||
continue
|
||||
|
||||
cleaned_lines.append(line)
|
||||
|
||||
@ -465,7 +465,7 @@ class PreProcessor(BasePreProcessor):
|
||||
found_footer = self._find_longest_common_ngram(end_of_pages)
|
||||
if found_footer:
|
||||
pages = [page.replace(found_footer, "") for page in pages]
|
||||
logger.debug(f"Removed header '{found_header}' and footer '{found_footer}' in document")
|
||||
logger.debug("Removed header '%s' and footer '%s' in document", found_header, found_footer)
|
||||
text = "\f".join(pages)
|
||||
return text
|
||||
|
||||
@ -537,10 +537,10 @@ class PreProcessor(BasePreProcessor):
|
||||
sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle")
|
||||
sentences = sentence_tokenizer.tokenize(text)
|
||||
except LookupError:
|
||||
logger.exception(f"PreProcessor couldn't load sentence tokenizer from {str(tokenizer_model_path)}")
|
||||
logger.exception("PreProcessor couldn't load sentence tokenizer from %s", tokenizer_model_path)
|
||||
except (UnpicklingError, ValueError) as e:
|
||||
logger.exception(
|
||||
f"PreProcessor couldn't determine model format of sentence tokenizer at {str(tokenizer_model_path)}."
|
||||
"PreProcessor couldn't determine model format of sentence tokenizer at %s", tokenizer_model_path
|
||||
)
|
||||
if sentences:
|
||||
return sentences
|
||||
|
||||
@ -41,7 +41,7 @@ class BaseRanker(BaseComponent):
|
||||
results = []
|
||||
|
||||
document_ids = [doc.id for doc in results]
|
||||
logger.debug(f"Retrieved documents with IDs: {document_ids}")
|
||||
logger.debug("Retrieved documents with IDs: %s", document_ids)
|
||||
output = {"documents": results}
|
||||
|
||||
return output, "output_1"
|
||||
@ -59,7 +59,7 @@ class BaseRanker(BaseComponent):
|
||||
|
||||
for doc_list in results:
|
||||
document_ids = [doc.id for doc in doc_list]
|
||||
logger.debug(f"Ranked documents with IDs: {document_ids}")
|
||||
logger.debug("Ranked documents with IDs: %s", document_ids)
|
||||
|
||||
output = {"documents": results}
|
||||
|
||||
|
||||
@ -728,7 +728,7 @@ class FARMReader(BaseReader):
|
||||
|
||||
:param directory: Directory where the Reader model should be saved
|
||||
"""
|
||||
logger.info(f"Saving reader model to {directory}")
|
||||
logger.info("Saving reader model to %s", directory)
|
||||
self.inferencer.model.save(directory)
|
||||
self.inferencer.processor.save(directory)
|
||||
|
||||
@ -1026,7 +1026,7 @@ class FARMReader(BaseReader):
|
||||
aggregated_per_doc = defaultdict(list)
|
||||
for label in labels:
|
||||
if not label.document.id:
|
||||
logger.error(f"Label does not contain a document id")
|
||||
logger.error("Label does not contain a document id")
|
||||
continue
|
||||
aggregated_per_doc[label.document.id].append(label)
|
||||
|
||||
@ -1036,7 +1036,7 @@ class FARMReader(BaseReader):
|
||||
for doc_id in all_doc_ids:
|
||||
doc = document_store.get_document_by_id(doc_id, index=doc_index)
|
||||
if not doc:
|
||||
logger.error(f"Document with the ID '{doc_id}' is not present in the document store.")
|
||||
logger.error("Document with the ID '%s' is not present in the document store.", doc_id)
|
||||
continue
|
||||
d[str(doc_id)] = {"context": doc.content}
|
||||
# get all questions / answers
|
||||
@ -1047,7 +1047,7 @@ class FARMReader(BaseReader):
|
||||
for label in aggregated_per_doc[doc_id]:
|
||||
aggregation_key = (doc_id, label.query)
|
||||
if label.answer is None:
|
||||
logger.error(f"Label.answer was None, but Answer object was expected: {label} ")
|
||||
logger.error("Label.answer was None, but Answer object was expected: %s", label)
|
||||
continue
|
||||
if label.answer.offsets_in_document is None:
|
||||
logger.error(
|
||||
|
||||
@ -177,13 +177,13 @@ class TableReader(BaseReader):
|
||||
no_answer_score = 1.0
|
||||
for document in documents:
|
||||
if document.content_type != "table":
|
||||
logger.warning(f"Skipping document with id '{document.id}' in TableReader as it is not of type table.")
|
||||
logger.warning("Skipping document with id '%s' in TableReader as it is not of type table.", document.id)
|
||||
continue
|
||||
|
||||
table: pd.DataFrame = document.content
|
||||
if table.shape[0] == 0:
|
||||
logger.warning(
|
||||
f"Skipping document with id '{document.id}' in TableReader as it does not contain any rows."
|
||||
"Skipping document with id '%s' in TableReader as it does not contain any rows.", document.id
|
||||
)
|
||||
continue
|
||||
# Tokenize query and current table
|
||||
@ -656,13 +656,13 @@ class RCIReader(BaseReader):
|
||||
answers = []
|
||||
for document in documents:
|
||||
if document.content_type != "table":
|
||||
logger.warning(f"Skipping document with id '{document.id}' in RCIReader as it is not of type table.")
|
||||
logger.warning("Skipping document with id '%s' in RCIReader as it is not of type table.", document.id)
|
||||
continue
|
||||
|
||||
table: pd.DataFrame = document.content
|
||||
if table.shape[0] == 0:
|
||||
logger.warning(
|
||||
f"Skipping document with id '{document.id}' in RCIReader as it does not contain any rows."
|
||||
"Skipping document with id '%s' in RCIReader as it does not contain any rows.", document.id
|
||||
)
|
||||
continue
|
||||
table = table.astype(str)
|
||||
|
||||
@ -221,7 +221,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
|
||||
else:
|
||||
train_examples.append(InputExample(texts=texts))
|
||||
|
||||
logger.info(f"Training/adapting {self.embedding_model} with {len(train_examples)} examples")
|
||||
logger.info("Training/adapting %s with %s examples", self.embedding_model, len(train_examples))
|
||||
train_dataloader = DataLoader(train_examples, batch_size=batch_size, drop_last=True, shuffle=True)
|
||||
train_loss = st_loss.loss(self.embedding_model)
|
||||
|
||||
|
||||
@ -328,7 +328,7 @@ class BaseRetriever(BaseComponent):
|
||||
query=query, filters=filters, top_k=top_k, index=index, headers=headers, scale_score=scale_score
|
||||
)
|
||||
document_ids = [doc.id for doc in documents]
|
||||
logger.debug(f"Retrieved documents with IDs: {document_ids}")
|
||||
logger.debug("Retrieved documents with IDs: %s", document_ids)
|
||||
output = {"documents": documents}
|
||||
|
||||
return output, "output_1"
|
||||
@ -351,13 +351,13 @@ class BaseRetriever(BaseComponent):
|
||||
if not isinstance(doc, Document):
|
||||
raise HaystackError(f"doc was of type {type(doc)}, but expected a Document.")
|
||||
document_ids.append(doc.id)
|
||||
logger.debug(f"Retrieved documents with IDs: {document_ids}")
|
||||
logger.debug("Retrieved documents with IDs: %s", document_ids)
|
||||
else:
|
||||
for doc_list in documents:
|
||||
if not isinstance(doc_list, list):
|
||||
raise HaystackError(f"doc_list was of type {type(doc_list)}, but expected a list of Documents.")
|
||||
document_ids = [doc.id for doc in doc_list]
|
||||
logger.debug(f"Retrieved documents with IDs: {document_ids}")
|
||||
logger.debug("Retrieved documents with IDs: %s", document_ids)
|
||||
output = {"documents": documents}
|
||||
|
||||
return output, "output_1"
|
||||
|
||||
@ -752,7 +752,7 @@ class DensePassageRetriever(BaseRetriever):
|
||||
use_fast_tokenizers=use_fast_tokenizers,
|
||||
similarity_function=similarity_function,
|
||||
)
|
||||
logger.info(f"DPR model loaded from {load_dir}")
|
||||
logger.info("DPR model loaded from %s", load_dir)
|
||||
|
||||
return dpr
|
||||
|
||||
@ -1435,7 +1435,7 @@ class TableTextRetriever(BaseRetriever):
|
||||
use_fast_tokenizers=use_fast_tokenizers,
|
||||
similarity_function=similarity_function,
|
||||
)
|
||||
logger.info(f"TableTextRetriever model loaded from {load_dir}")
|
||||
logger.info("TableTextRetriever model loaded from %s", load_dir)
|
||||
|
||||
return mm_retriever
|
||||
|
||||
@ -1530,7 +1530,7 @@ class EmbeddingRetriever(BaseRetriever):
|
||||
else model_format
|
||||
)
|
||||
|
||||
logger.info(f"Init retriever using embeddings of model {embedding_model}")
|
||||
logger.info("Init retriever using embeddings of model %s", embedding_model)
|
||||
|
||||
if self.model_format not in _EMBEDDING_ENCODERS.keys():
|
||||
raise ValueError(f"Unknown retriever embedding model format {model_format}")
|
||||
|
||||
@ -438,7 +438,7 @@ class TfidfRetriever(BaseRetriever):
|
||||
continue
|
||||
paragraphs.append(Paragraph(document_id=doc.id, paragraph_id=p_id, content=(p,), meta=doc.meta))
|
||||
p_id += 1
|
||||
logger.info(f"Found {len(paragraphs)} candidate paragraphs from {len(documents)} docs in DB")
|
||||
logger.info("Found %s candidate paragraphs from %s docs in DB", len(paragraphs), len(documents))
|
||||
return paragraphs
|
||||
|
||||
def _calc_scores(self, queries: Union[str, List[str]]) -> List[Dict[int, float]]:
|
||||
@ -514,7 +514,9 @@ class TfidfRetriever(BaseRetriever):
|
||||
df_sliced = df_sliced[:top_k]
|
||||
|
||||
logger.debug(
|
||||
f"Identified {df_sliced.shape[0]} candidates via retriever:\n {df_sliced.to_string(col_space=10, index=False)}"
|
||||
"Identified %s candidates via retriever:\n%s",
|
||||
df_sliced.shape[0],
|
||||
df_sliced.to_string(col_space=10, index=False),
|
||||
)
|
||||
|
||||
# get actual content for the top candidates
|
||||
@ -585,8 +587,9 @@ class TfidfRetriever(BaseRetriever):
|
||||
df_sliced = self.df.loc[query_result.keys()]
|
||||
df_sliced = df_sliced[:top_k]
|
||||
logger.debug(
|
||||
f"Identified {df_sliced.shape[0]} candidates via retriever:"
|
||||
f"\n {df_sliced.to_string(col_space=10, index=False)}"
|
||||
"Identified %s candidates via retriever:\n%s",
|
||||
df_sliced.shape[0],
|
||||
df_sliced.to_string(col_space=10, index=False),
|
||||
)
|
||||
|
||||
# get actual content for the top candidates
|
||||
|
||||
@ -14,7 +14,6 @@ import json
|
||||
import inspect
|
||||
import logging
|
||||
import tempfile
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
@ -296,14 +295,14 @@ class Pipeline:
|
||||
f"Deployed pipeline configs are not allowed to be updated. Please undeploy pipeline config '{pipeline_config_name}' first."
|
||||
)
|
||||
client.update_pipeline_config(config=config, pipeline_config_name=pipeline_config_name)
|
||||
logger.info(f"Pipeline config '{pipeline_config_name}' successfully updated.")
|
||||
logger.info("Pipeline config '%s' successfully updated.", pipeline_config_name)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Pipeline config '{pipeline_config_name}' already exists. Set `overwrite=True` to overwrite pipeline config."
|
||||
)
|
||||
else:
|
||||
client.save_pipeline_config(config=config, pipeline_config_name=pipeline_config_name)
|
||||
logger.info(f"Pipeline config '{pipeline_config_name}' successfully created.")
|
||||
logger.info("Pipeline config '%s' successfully created.", pipeline_config_name)
|
||||
|
||||
@classmethod
|
||||
def deploy_on_deepset_cloud(
|
||||
@ -508,13 +507,15 @@ class Pipeline:
|
||||
predecessors = set(nx.ancestors(self.graph, node_id))
|
||||
if predecessors.isdisjoint(set(queue.keys())): # only execute if predecessor nodes are executed
|
||||
try:
|
||||
logger.debug(f"Running node `{node_id}` with input `{node_input}`")
|
||||
logger.debug("Running node '%s` with input: %s", node_id, node_input)
|
||||
node_output, stream_id = self._run_node(node_id, node_input)
|
||||
except Exception as e:
|
||||
tb = traceback.format_exc()
|
||||
# The input might be a really large object with thousands of embeddings.
|
||||
# If you really want to see it, raise the log level.
|
||||
logger.debug("Exception while running node '%s' with input %s", node_id, node_input)
|
||||
raise Exception(
|
||||
f"Exception while running node `{node_id}` with input `{node_input}`: {e}, full stack trace: {tb}"
|
||||
)
|
||||
f"Exception while running node '{node_id}': {e}\nEnable debug logging to see the data that was passed when the pipeline failed."
|
||||
) from e
|
||||
queue.pop(node_id)
|
||||
#
|
||||
if stream_id == "split":
|
||||
@ -660,14 +661,15 @@ class Pipeline:
|
||||
predecessors = set(nx.ancestors(self.graph, node_id))
|
||||
if predecessors.isdisjoint(set(queue.keys())): # only execute if predecessor nodes are executed
|
||||
try:
|
||||
logger.debug(f"Running node `{node_id}` with input `{node_input}`")
|
||||
logger.debug("Running node '%s` with input: %s", node_id, node_input)
|
||||
node_output, stream_id = self.graph.nodes[node_id]["component"]._dispatch_run_batch(**node_input)
|
||||
except Exception as e:
|
||||
tb = traceback.format_exc()
|
||||
# The input might be a really large object with thousands of embeddings.
|
||||
# If you really want to see it, raise the log level.
|
||||
logger.debug("Exception while running node '%s' with input %s", node_id, node_input)
|
||||
raise Exception(
|
||||
f"Exception while running node `{node_id}` with input `{node_input}`: {e}, "
|
||||
f"full stack trace: {tb}"
|
||||
)
|
||||
f"Exception while running node '{node_id}': {e}\nEnable debug logging to see the data that was passed when the pipeline failed."
|
||||
) from e
|
||||
queue.pop(node_id)
|
||||
|
||||
if stream_id == "split":
|
||||
@ -746,7 +748,7 @@ class Pipeline:
|
||||
|
||||
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
|
||||
data_path = util.download_and_unzip(url, dataset_dir)
|
||||
logger.info(f"Dataset downloaded here: {data_path}")
|
||||
logger.info("Dataset downloaded here: %s", data_path)
|
||||
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test") # or split = "train" or "dev"
|
||||
|
||||
# check index before eval
|
||||
@ -776,11 +778,11 @@ class Pipeline:
|
||||
|
||||
# Clean up document store
|
||||
if not keep_index and document_store is not None and document_store.index is not None:
|
||||
logger.info(f"Cleaning up: deleting index '{document_store.index}'...")
|
||||
logger.info("Cleaning up: deleting index '%s' ...", document_store.index)
|
||||
document_store.delete_index(document_store.index)
|
||||
|
||||
# Evaluate your retrieval using NDCG@k, MAP@K ...
|
||||
logger.info(f"Retriever evaluation for k in: {retriever.k_values}")
|
||||
logger.info("Retriever evaluation for k in: %s", retriever.k_values)
|
||||
ndcg, map_, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
|
||||
return ndcg, map_, recall, precision
|
||||
|
||||
@ -985,10 +987,10 @@ class Pipeline:
|
||||
if not reuse_index:
|
||||
raise HaystackError(f"Index '{document_store.index}' is not empty. Please provide an empty index.")
|
||||
else:
|
||||
logger.info(f"indexing {len(corpus_file_paths)} documents...")
|
||||
logger.info("indexing %s documents...", len(corpus_file_paths))
|
||||
index_pipeline.run(file_paths=corpus_file_paths, meta=corpus_file_metas, params=index_params)
|
||||
document_count = document_store.get_document_count()
|
||||
logger.info(f"indexing {len(evaluation_set_labels)} files to {document_count} documents finished.")
|
||||
logger.info("indexing %s files to %s documents finished.", len(evaluation_set_labels), document_count)
|
||||
|
||||
tracker.track_params({"pipeline_index_document_count": document_count})
|
||||
|
||||
@ -1056,7 +1058,7 @@ class Pipeline:
|
||||
|
||||
# Clean up document store
|
||||
if not reuse_index and document_store.index is not None:
|
||||
logger.info(f"Cleaning up: deleting index '{document_store.index}'...")
|
||||
logger.info("Cleaning up: deleting index '%s'...", document_store.index)
|
||||
document_store.delete_index(document_store.index)
|
||||
|
||||
finally:
|
||||
@ -1435,7 +1437,7 @@ class Pipeline:
|
||||
for i, (query, query_labels) in enumerate(zip(queries, query_labels_per_query)):
|
||||
|
||||
if query_labels is None or query_labels.labels is None:
|
||||
logger.warning(f"There is no label for query '{query}'. Query will be omitted.")
|
||||
logger.warning("There is no label for query '%s'. Query will be omitted.", query)
|
||||
continue
|
||||
|
||||
# remarks for no_answers:
|
||||
@ -1925,7 +1927,7 @@ class Pipeline:
|
||||
|
||||
component_params = definitions[name].get("params", {})
|
||||
component_type = definitions[name]["type"]
|
||||
logger.debug(f"Loading component `{name}` of type `{definitions[name]['type']}`")
|
||||
logger.debug(f"Loading component '%s' of type '%s'", name, definitions[name]["type"])
|
||||
|
||||
for key, value in component_params.items():
|
||||
# Component params can reference to other components. For instance, a Retriever can reference a
@ -2188,9 +2190,9 @@ class _HaystackBeirRetrieverAdapter:
file_paths.append(file_path)
metas.append({"id": id, "name": doc.get("title", None)})

logger.info(f"indexing {len(corpus)} documents...")
logger.info("indexing %s documents...", len(corpus))
self.index_pipeline.run(file_paths=file_paths, meta=metas, params=self.index_params)
logger.info(f"indexing finished.")
logger.info("indexing finished.")

# adjust query_params to ensure top_k is retrieved
query_params = copy.deepcopy(self.query_params)

@ -208,7 +208,7 @@ def validate_yaml(
"""
pipeline_config = read_pipeline_config_from_yaml(path)
validate_config(pipeline_config=pipeline_config, strict_version_check=strict_version_check, extras=extras)
logging.debug(f"'{path}' contains valid Haystack pipelines.")
logging.debug("'%s' contains valid Haystack pipelines.", path)


def validate_config(
@ -343,7 +343,7 @@ def validate_schema(pipeline_config: Dict, strict_version_check: bool = False, e
f"Validation failed. {validation.message}. {error_location} " "See the stacktrace for more information."
) from validation

logging.debug(f"The given configuration is valid according to the JSON schema.")
logging.debug("The given configuration is valid according to the JSON schema.")


def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_definitions: Dict[str, Any]):
@ -357,7 +357,7 @@ def validate_pipeline_graph(pipeline_definition: Dict[str, Any], component_defin
graph = _init_pipeline_graph(root_node_name=root_node_name)
for node in pipeline_definition["nodes"]:
graph = _add_node_to_pipeline_graph(graph=graph, node=node, components=component_definitions)
logging.debug(f"The graph for pipeline '{pipeline_definition['name']}' is valid.")
logging.debug("The graph for pipeline '%s' is valid.", pipeline_definition["name"])


def _find_root_in_pipeline_definition(pipeline_definition: Dict[str, Any]):

@ -225,6 +225,9 @@ class Document:
)

def __repr__(self):
values = self.to_dict()
if values.get("embedding", False):
values["embedding"] = f"<embedding of shape {getattr(values['embedding'], 'shape', '[no shape]')}>"
return f"<Document: {str(values)}>"

def __str__(self):
@ -1389,7 +1392,7 @@ class EvaluationResult:
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
"""
out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
logger.info(f"Saving evaluation results to {out_dir}")
logger.info("Saving evaluation results to %s", out_dir)
if not out_dir.exists():
out_dir.mkdir(parents=True)
for node_name, df in self.node_results.items():

@ -73,7 +73,7 @@ def enable_writing_events_to_file():
Enables writing each event that is sent to the log file specified in LOG_PATH
"""
os.environ[HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED] = "True"
logger.info(f"Writing events to log file {LOG_PATH} has been enabled.")
logger.info("Writing events to log file %s has been enabled.", LOG_PATH)


def disable_writing_events_to_file():
@ -81,7 +81,7 @@ def disable_writing_events_to_file():
Disables writing each event that is sent to the log file specified in LOG_PATH
"""
os.environ[HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED] = "False"
logger.info(f"Writing events to log file {LOG_PATH} has been disabled.")
logger.info("Writing events to log file %s has been disabled.", LOG_PATH)


def is_telemetry_enabled() -> bool:
@ -235,7 +235,7 @@ def _read_telemetry_config():
if "user_id" in config and user_id is None:
user_id = config["user_id"]
except Exception as e:
logger.debug(f"Telemetry was not able to read the config file {CONFIG_PATH}.", exc_info=e)
logger.debug("Telemetry was not able to read the config file %s", CONFIG_PATH, exc_info=e)


def _write_telemetry_config():
@ -257,7 +257,7 @@ def _write_telemetry_config():
with open(CONFIG_PATH, "w") as outfile:
yaml.dump(config, outfile, default_flow_style=False)
except Exception:
logger.debug(f"Could not write config file to {CONFIG_PATH}.")
logger.debug("Could not write config file to %s", CONFIG_PATH)
send_custom_event(event="config saving failed")


@ -266,7 +266,7 @@ def _write_event_to_telemetry_log_file(distinct_id: str, event: str, properties:
with open(LOG_PATH, "a") as file_object:
file_object.write(f"{event}, {properties}, {distinct_id}\n")
except Exception as e:
logger.debug(f"Telemetry was not able to write event to log file {LOG_PATH}.", exc_info=e)
logger.debug("Telemetry was not able to write event to log file %s", LOG_PATH, exc_info=e)


def _delete_telemetry_file(file_type_to_delete: TelemetryFileType):
@ -279,7 +279,7 @@ def _delete_telemetry_file(file_type_to_delete: TelemetryFileType):
try:
path.unlink() # todo add missing_ok=True to the unlink() call when upgrading to python>3.7
except Exception as e:
logger.debug(f"Telemetry was not able to delete the {file_type_to_delete} at {path}.", exc_info=e)
logger.debug("Telemetry was not able to delete the %s at %s", file_type_to_delete, path, exc_info=e)


class NonPrivateParameters:

@ -546,7 +546,7 @@ class PipelineClient:
pipelines_url = f"{workspace_url}/pipelines"
response = self.client.post(url=pipelines_url, data=yaml.dump(config), headers=headers).json()
if "name" not in response or response["name"] != pipeline_config_name:
logger.warning(f"Unexpected response from saving pipeline config: {response}")
logger.warning("Unexpected response from saving pipeline config: %s", response)

def update_pipeline_config(
self,
@ -568,7 +568,7 @@ class PipelineClient:
yaml_url = f"{pipeline_url}/yaml"
response = self.client.put(url=yaml_url, data=yaml.dump(config), headers=headers).json()
if "name" not in response or response["name"] != pipeline_config_name:
logger.warning(f"Unexpected response from updating pipeline config: {response}")
logger.warning("Unexpected response from updating pipeline config: %s", response)

def deploy(
self,
@ -609,9 +609,9 @@ class PipelineClient:

if status == PipelineStatus.DEPLOYED:
if changed:
logger.info(f"Pipeline config '{pipeline_config_name}' successfully deployed.")
logger.info("Pipeline config '%s' successfully deployed.", pipeline_config_name)
else:
logger.info(f"Pipeline config '{pipeline_config_name}' is already deployed.")
logger.info("Pipeline config '%s' is already deployed.", pipeline_config_name)
logger.info(
f"Search endpoint for pipeline config '{pipeline_config_name}' is up and running for you under {pipeline_url}"
)
@ -628,7 +628,7 @@ class PipelineClient:
f" ]\n"
f"}}'"
)
logger.info(f"Try it out using the following curl command:\n{curl_cmd}")
logger.info("Try it out using the following curl command:\n%s", curl_cmd)

elif status == PipelineStatus.DEPLOYMENT_FAILED:
raise DeepsetCloudError(
@ -673,9 +673,9 @@ class PipelineClient:

if status == PipelineStatus.UNDEPLOYED:
if changed:
logger.info(f"Pipeline config '{pipeline_config_name}' successfully undeployed.")
logger.info("Pipeline config '%s' successfully undeployed.", pipeline_config_name)
else:
logger.info(f"Pipeline config '{pipeline_config_name}' is already undeployed.")
logger.info("Pipeline config '%s' is already undeployed.", pipeline_config_name)
elif status in [PipelineStatus.DEPLOYMENT_IN_PROGRESS, PipelineStatus.DEPLOYMENT_SCHEDULED]:
raise DeepsetCloudError(
f"Undeployment of pipeline config '{pipeline_config_name}' aborted. Deployment was requested."
@ -754,7 +754,7 @@ class PipelineClient:
raise DeepsetCloudError(f"Pipeline config '{pipeline_config_name}' does not exist anymore.")
status = PipelineStatus.from_str(pipeline_info["status"])
if status in valid_transitioning_states:
logger.info(f"Current status of '{pipeline_config_name}' is: '{status}'")
logger.info("Current status of '%s' is: '%s'", pipeline_config_name, status)
time.sleep(5)

return status, True
@ -907,7 +907,7 @@ class EvaluationSetClient:
f"Successfully uploaded evaluation set file {file_path}. You can access it now under evaluation set '{file_path.name}'."
)
except DeepsetCloudError as e:
logger.error(f"Error uploading evaluation set file {file_path}: {e.args}")
logger.error("Error uploading evaluation set file %s: %s", file_path, e.args)

def get_evaluation_set(
self, evaluation_set: Optional[str] = None, workspace: Optional[str] = None
@ -993,9 +993,9 @@ class FileClient:
file_id = response_file_upload.json().get("file_id")
file_ids.append(file_id)
except Exception as e:
logger.exception(f"Error uploading file {file_path}")
logger.exception("Error uploading file %s", file_path)

logger.info(f"Successfully uploaded {len(file_ids)} files.")
logger.info("Successfully uploaded %s files.", len(file_ids))

def delete_file(self, file_id: str, workspace: Optional[str] = None, headers: dict = None):
"""
@ -1147,7 +1147,7 @@ class EvaluationRunClient:
eval_run_url = f"{workspace_url}/eval_runs/{eval_run_name}"
response = self.client.delete(eval_run_url, headers=headers)
if response.status_code == 204:
logger.info(f"Evaluation run '{eval_run_name}' deleted.")
logger.info("Evaluation run '%s' deleted.", eval_run_name)

def start_eval_run(self, eval_run_name: str, workspace: Optional[str] = None, headers: dict = None):
"""
@ -1162,7 +1162,7 @@ class EvaluationRunClient:
eval_run_url = f"{workspace_url}/eval_runs/{eval_run_name}/start"
response = self.client.post(eval_run_url, headers=headers)
if response.status_code == 204:
logger.info(f"Evaluation run '{eval_run_name}' has been started.")
logger.info("Evaluation run '%s' has been started.", eval_run_name)

def update_eval_run(
self,

@ -79,7 +79,7 @@ def launch_weaviate(sleep=15):


def stop_container(container_name, delete_container=False):
logger.debug(f"Stopping {container_name}...")
logger.debug("Stopping %s...", container_name)
status = subprocess.run([f"docker stop {container_name}"], shell=True)
if status.returncode:
logger.warning(
@ -117,7 +117,7 @@ def stop_service(document_store, delete_container=False):
elif "WeaviateDocumentStore" in ds_class:
stop_weaviate(delete_container)
else:
logger.warning(f"No support yet for auto stopping the service behind a {type(document_store)}")
logger.warning("No support yet for auto stopping the service behind a %s", type(document_store))


def launch_milvus(sleep=15, delete_existing=False):

@ -28,6 +28,6 @@ def cache_models(models: List[str] = None, use_auth_token: Optional[Union[str, b
import transformers

for model_to_cache in models:
logging.info(f"Caching {model_to_cache}")
logging.info("Caching %s", model_to_cache)
transformers.AutoTokenizer.from_pretrained(model_to_cache, use_auth_token=use_auth_token)
transformers.AutoModel.from_pretrained(model_to_cache, use_auth_token=use_auth_token)

@ -117,19 +117,19 @@ class StdoutTrackingHead(BaseTrackingHead):
def init_experiment(
self, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False
):
logger.info(f"\n **** Starting experiment '{experiment_name}' (Run: {run_name}) ****")
logger.info("\n **** Starting experiment '%s' (Run: %s) ****", experiment_name, run_name)

def track_metrics(self, metrics: Dict[str, Any], step: int):
logger.info(f"Logged metrics at step {step}: \n {metrics}")
logger.info("Logged metrics at step %s: \n %s", step, metrics)

def track_params(self, params: Dict[str, Any]):
logger.info(f"Logged parameters: \n {params}")
logger.info("Logged parameters: \n %s", params)

def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
logger.warning(f"Cannot log artifacts with StdoutLogger: \n {dir_path}")
logger.warning("Cannot log artifacts with StdoutLogger: \n %s", dir_path)

def end_run(self):
logger.info(f"**** End of Experiment **** ")
logger.info("**** End of Experiment **** ")


class MLflowTrackingHead(BaseTrackingHead):
@ -148,7 +148,9 @@ class MLflowTrackingHead(BaseTrackingHead):
mlflow.set_tracking_uri(self.tracking_uri)
mlflow.set_experiment(experiment_name)
mlflow.start_run(run_name=run_name, nested=nested, tags=tags)
logger.info(f"Tracking run {run_name} of experiment {experiment_name} by mlflow under {self.tracking_uri}")
logger.info(
"Tracking run %s of experiment %s by mlflow under %s", run_name, experiment_name, self.tracking_uri
)
if self.auto_track_environment:
mlflow.log_params(flatten_dict({"environment": get_or_create_env_meta_data()}))
except ConnectionError:
@ -163,9 +165,9 @@ class MLflowTrackingHead(BaseTrackingHead):
metrics = flatten_dict(metrics)
mlflow.log_metrics(metrics, step=step)
except ConnectionError:
logger.warning(f"ConnectionError in logging metrics to MLflow.")
logger.warning("ConnectionError in logging metrics to MLflow.")
except Exception as e:
logger.warning(f"Failed to log metrics: {e}")
logger.warning("Failed to log metrics: %s", e)

def track_params(self, params: Dict[str, Any]):
try:
@ -174,15 +176,15 @@ class MLflowTrackingHead(BaseTrackingHead):
except ConnectionError:
logger.warning("ConnectionError in logging params to MLflow")
except Exception as e:
logger.warning(f"Failed to log params: {e}")
logger.warning("Failed to log params: %s", e)

def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
try:
mlflow.log_artifacts(dir_path, artifact_path)
except ConnectionError:
logger.warning(f"ConnectionError in logging artifacts to MLflow")
logger.warning("ConnectionError in logging artifacts to MLflow")
except Exception as e:
logger.warning(f"Failed to log artifacts: {e}")
logger.warning("Failed to log artifacts: %s", e)

def end_run(self):
mlflow.end_run()

@ -81,10 +81,10 @@ def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] =

is_not_empty = len(list(Path(path).rglob("*"))) > 0
if is_not_empty:
logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
logger.info("Found data stored in '%s'. Delete this first if you really want to fetch new data.", output_dir)
return False
else:
logger.info(f"Fetching from {url} to `{output_dir}`")
logger.info("Fetching from %s to '%s'", url, output_dir)

_, _, archive_extension = url.rpartition(".")
request_data = requests.get(url, proxies=proxies)

@ -192,8 +192,8 @@ def create_dpr_training_dataset(squad_data: dict, retriever: BaseRetriever, num_
n_questions += 1
yield dict_DPR

logger.info(f"Number of skipped questions: {n_non_added_questions}")
logger.info(f"Number of added questions: {n_questions}")
logger.info("Number of skipped questions: %s", n_non_added_questions)
logger.info("Number of added questions: %s", n_questions)


def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path, total_nb_questions: int, split_dataset: bool):

@ -26,17 +26,17 @@ def setup_pipelines() -> Dict[str, Any]:

# Load query pipeline
query_pipeline = Pipeline.load_from_yaml(Path(config.PIPELINE_YAML_PATH), pipeline_name=config.QUERY_PIPELINE_NAME)
logging.info(f"Loaded pipeline nodes: {query_pipeline.graph.nodes.keys()}")
logging.info("Loaded pipeline nodes: %s", query_pipeline.graph.nodes.keys())
pipelines["query_pipeline"] = query_pipeline

# Find document store
document_store = query_pipeline.get_document_store()
logging.info(f"Loaded docstore: {document_store}")
logging.info("Loaded docstore: %s", document_store)
pipelines["document_store"] = document_store

# Setup concurrency limiter
concurrency_limiter = RequestLimiter(config.CONCURRENT_REQUEST_PER_WORKER)
logging.info("Concurrent requests per worker: {CONCURRENT_REQUEST_PER_WORKER}")
logging.info("Concurrent requests per worker: %s", config.CONCURRENT_REQUEST_PER_WORKER)
pipelines["concurrency_limiter"] = concurrency_limiter

# Load indexing pipeline (if available)
@ -53,7 +53,7 @@ def setup_pipelines() -> Dict[str, Any]:

except PipelineConfigError as e:
indexing_pipeline = None
logger.error(f"{e.message}\nFile Upload API will not be available.")
logger.error("%s\nFile Upload API will not be available.", e.message)

finally:
pipelines["indexing_pipeline"] = indexing_pipeline

@ -122,7 +122,7 @@ def main():
for current_config in combine_config(distillation_settings):
descriptions.append(f"Results of student with distillation (config: {current_config}")
# distillation training
logger.info(f"Training student with distillation (config: {current_config}")
logger.info("Training student with distillation (config: %s)", current_config)
results.append(
train_student_with_distillation(
student, teacher, download_folder, train_file, test_file, **current_config, **training_settings
@ -145,9 +145,9 @@ def main():
logger.info("Evaluation results:")
for result, description in zip(results, descriptions):
logger.info(description)
logger.info(f"EM: {result['EM']}")
logger.info(f"F1: {result['f1']}")
logger.info(f"Top n accuracy: {result['top_n_accuracy']}")
logger.info("EM: %s", result["EM"])
logger.info("F1: %s", result["f1"])
logger.info("Top n accuracy: %s", result["top_n_accuracy"])


if __name__ == "__main__":

@ -51,7 +51,7 @@ def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs)
index_to_doc_store(doc_store, docs, None, labels)
for reader_name in reader_models:
for reader_type in reader_types:
logger.info(f"##### Start reader run - model:{reader_name}, type: {reader_type} ##### ")
logger.info("##### Start reader run - model: %s, type: %s ##### ", reader_name, reader_type)
try:
reader = get_reader(reader_name, reader_type)
results = reader.eval(

@ -56,7 +56,7 @@ def benchmark_indexing(
retriever_results = []
for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
logger.info("##### Start indexing run: %s, %s, %s docs ##### ", retriever_name, doc_store_name, n_docs)
try:
doc_store = get_document_store(doc_store_name)
retriever = get_retriever(retriever_name, doc_store, DEVICES)
@ -160,7 +160,7 @@ def benchmark_querying(
for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
try:
logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
logger.info("##### Start querying run: %s, %s, %s docs ##### ", retriever_name, doc_store_name, n_docs)
if retriever_name in ["elastic", "sentence_transformers"]:
similarity = "cosine"
else:
@ -273,7 +273,7 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
ret = []
id_to_doc = {x.meta["passage_id"]: x for x in docs}
for ef in embeddings_filenames:
logger.info(f"Adding precomputed embeddings from {embeddings_dir + ef}")
logger.info("Adding precomputed embeddings from %s", embeddings_dir + ef)
filename = embeddings_dir + ef
embeds = pickle.load(open(filename, "rb"))
for i, vec in embeds:
@ -284,7 +284,7 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
# In the official DPR repo, there are only 20594995 precomputed embeddings for 21015324 wikipedia passages
# If there isn't an embedding for a given doc, we remove it here
ret = [x for x in ret if x.embedding is not None]
logger.info(f"Embeddings loaded for {len(ret)}/{len(docs)} docs")
logger.info("Embeddings loaded for %s/%s docs", len(ret), len(docs))
return ret



@ -170,16 +170,16 @@ def download_from_url(url: str, filepath: Union[str, Path]):
:return: local path of the downloaded file
"""

logger.info(f"Downloading {url}")
logger.info("Downloading %s", url)
# Create local folder
folder, filename = os.path.split(filepath)
if not os.path.exists(folder):
os.makedirs(folder)
# Download file if not present locally
if os.path.exists(filepath):
logger.info(f"Skipping {url} (exists locally)")
logger.info("Skipping %s (exists locally)", url)
else:
logger.info(f"Downloading {url} to {filepath} ")
logger.info("Downloading %s to %s", filepath)
|
||||
with open(filepath, "wb") as file:
|
||||
http_get(url=url, temp_file=file)
|
||||
return filepath
|
||||
|
||||
@ -1024,7 +1024,7 @@ def adaptive_model_qa(num_processes):
|
||||
current_process = psutil.Process()
|
||||
children = current_process.children()
|
||||
if len(children) != 0:
|
||||
logging.error(f"Not all the subprocesses are closed! {len(children)} are still running.")
|
||||
logging.error("Not all the subprocesses are closed! %s are still running.", len(children))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
@ -132,7 +132,7 @@ class Index:
|
||||
response: dict = {"namespace": namespace, "vectors": {}}
|
||||
if namespace not in self.index_config.namespaces:
|
||||
# If we query an empty/non-existent namespace, Pinecone will just return an empty response
|
||||
logger.warning(f"No namespace called '{namespace}'")
|
||||
logger.warning("No namespace called '%s'", namespace)
|
||||
return response
|
||||
records = self.index_config.namespaces[namespace]
|
||||
namespace_ids = records.keys()
|
||||
|
||||
@ -56,11 +56,11 @@ def reduce_windows_recursion_limit():
|
||||
default_recursion_limit = sys.getrecursionlimit()
|
||||
if is_windows:
|
||||
reduced_recursion_limit = default_recursion_limit // 2
|
||||
logger.warning(f"Reducing recursion limit to {reduced_recursion_limit}")
|
||||
logger.warning("Reducing recursion limit to %s", reduced_recursion_limit)
|
||||
sys.setrecursionlimit(reduced_recursion_limit)
|
||||
yield
|
||||
if is_windows:
|
||||
logger.warning(f"Resetting recursion limit to {default_recursion_limit}")
|
||||
logger.warning("Resetting recursion limit to %s", default_recursion_limit)
|
||||
sys.setrecursionlimit(default_recursion_limit)
|
||||
|
||||
|
||||
|
||||
@ -30,9 +30,9 @@ def tutorial5_evaluation():
|
||||
doc_index = "tutorial5_docs"
|
||||
label_index = "tutorial5_labels"
|
||||
|
||||
##############################################
|
||||
###############################################
|
||||
# Code
|
||||
##############################################
|
||||
###############################################
|
||||
launch_es()
|
||||
|
||||
# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers
|
||||
|
||||