mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-01 18:29:32 +00:00
feat: reduce and focus telemetry (#4087)
* simplified telemetry and docker containers detection * pylint * mypy * mypy * Add new credentials and metadata * remove prints * mypy * remove comment * simplify inout len measurement * black * removed old telemetry, to revert * reintroduce env function * reintroduce old telemetry * fix telemetry selection * telemetry for promptnode * telemetry for some training methods * telemetry for eval and distillation * mypy & pylint * review * Update lg * mypy * improve docstrings * pylint * mypy * fix test * linting * remove old tests --------- Co-authored-by: agnieszka-m <amarzec13@gmail.com>
This commit is contained in:
parent
181e5474e8
commit
f816efa50c
@ -25,3 +25,10 @@ from haystack.environment import set_pytorch_secure_model_loading
|
||||
|
||||
pd.options.display.max_colwidth = 80
|
||||
set_pytorch_secure_model_loading()
|
||||
|
||||
import os
|
||||
|
||||
|
||||
from haystack.telemetry_2 import send_event
|
||||
|
||||
send_event(event_name="Haystack imported")
|
||||
|
||||
@ -2,16 +2,13 @@ import logging
|
||||
import os
|
||||
import platform
|
||||
import sys
|
||||
from typing import Any, Dict
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
from haystack import __version__
|
||||
|
||||
HAYSTACK_EXECUTION_CONTEXT = "HAYSTACK_EXECUTION_CONTEXT"
|
||||
HAYSTACK_DOCKER_CONTAINER = "HAYSTACK_DOCKER_CONTAINER"
|
||||
|
||||
# Any remote API (OpenAI, Cohere etc.)
|
||||
HAYSTACK_REMOTE_API_BACKOFF_SEC = "HAYSTACK_REMOTE_API_BACKOFF_SEC"
|
||||
HAYSTACK_REMOTE_API_MAX_RETRIES = "HAYSTACK_REMOTE_API_MAX_RETRIES"
|
||||
@ -32,10 +29,65 @@ def set_pytorch_secure_model_loading(flag_val="1"):
|
||||
logger.info("TORCH_FORCE_WEIGHTS_ONLY_LOAD is already set to %s, Haystack will use the same.", os_flag_val)
|
||||
|
||||
|
||||
def is_containerized() -> Optional[bool]:
    """
    Best-effort check of whether the current process runs inside a container (for telemetry purposes).

    The presence of `/.dockerenv` is checked first. Otherwise the first line of `/proc/1/sched`
    (the CPU scheduling info of PID 1) is inspected: if it starts with `systemd` or `init`, the
    process is considered to be running directly on the host.
    See https://www.baeldung.com/linux/is-process-running-inside-container

    :return: True if a container is detected, False if it is not, None if the detection failed
        (for example because `/proc/1/sched` cannot be opened).
    """
    # Names the first line of /proc/1/sched starts with when PID 1 is a regular init system.
    host_init_names = ("systemd", "init")
    try:
        if os.path.exists("/.dockerenv"):
            return True
        with open("/proc/1/sched", "r") as sched_file:
            pid_one_header = sched_file.readline()
        return not pid_one_header.startswith(host_init_names)
    except Exception:
        logger.debug("Failed to detect if Haystack is running in a container (for telemetry purposes).")
        return None
|
||||
|
||||
|
||||
def collect_static_system_specs() -> Dict[str, Any]:
    """
    Gathers the metadata about the setup Haystack runs on that is collected once per process:
    Haystack, transformers, torch and CUDA versions, whether the process is containerized,
    operating system release/family/machine, Python version, number of CPUs and number of GPUs.

    :return: a flat dictionary mapping dotted property names (for example `libraries.torch`)
        to their values. Library entries are False when the library is not loaded.
    """
    loaded_modules = sys.modules
    torch_loaded = "torch" in loaded_modules
    cuda_available = torch.cuda.is_available()

    specs: Dict[str, Any] = {}

    # Library versions
    specs["libraries.haystack"] = __version__
    specs["libraries.transformers"] = transformers.__version__ if "transformers" in loaded_modules else False
    specs["libraries.torch"] = torch.__version__ if torch_loaded else False
    specs["libraries.cuda"] = torch.version.cuda if torch_loaded and cuda_available else False

    # Operating system and interpreter
    specs["os.containerized"] = is_containerized()
    # FIXME review these
    specs["os.version"] = platform.release()
    specs["os.family"] = platform.system()
    specs["os.machine"] = platform.machine()
    specs["python.version"] = platform.python_version()  # FIXME verify

    # Hardware
    specs["hardware.cpus"] = os.cpu_count()  # FIXME verify
    specs["hardware.gpus"] = torch.cuda.device_count() if cuda_available else 0  # probably ok

    return specs
|
||||
|
||||
|
||||
def collect_dynamic_system_specs() -> Dict[str, Any]:
    """
    Collects the versions of optional libraries that may get imported at any point during
    the lifetime of the process (pytest, ray, IPython, Google Colab).

    Only `sys.modules` is inspected, so calling this function never imports anything.

    :return: a dictionary mapping `libraries.<name>` to the version string of the library
        if it is currently loaded, or to False if it is not. A loaded module that exposes
        no `__version__` attribute is reported as "unknown".
    """

    def _version_if_loaded(module_name: str) -> Any:
        # `sys.modules` may also map a name to None, which is treated as "not loaded".
        module = sys.modules.get(module_name)
        if module is None:
            return False
        return getattr(module, "__version__", "unknown")

    return {
        "libraries.pytest": _version_if_loaded("pytest"),
        "libraries.ray": _version_if_loaded("ray"),
        # The IPython package registers itself as "IPython": module names are case-sensitive.
        "libraries.ipython": _version_if_loaded("IPython"),
        # Report the version of google.colab itself rather than the one of pytest.
        "libraries.colab": _version_if_loaded("google.colab"),
    }
|
||||
|
||||
|
||||
#
|
||||
# Old telemetry
|
||||
#
|
||||
|
||||
|
||||
def get_or_create_env_meta_data() -> Dict[str, Any]:
|
||||
"""
|
||||
Collects meta data about the setup that is used with Haystack, such as: operating system, python version, Haystack version, transformers version, pytorch version, number of GPUs, execution environment, and the value stored in the env variable HAYSTACK_EXECUTION_CONTEXT.
|
||||
"""
|
||||
from haystack.telemetry import HAYSTACK_EXECUTION_CONTEXT
|
||||
|
||||
global env_meta_data # pylint: disable=global-statement
|
||||
if not env_meta_data:
|
||||
env_meta_data = {
|
||||
@ -60,6 +112,8 @@ def _get_execution_environment():
|
||||
Identifies the execution environment that Haystack is running in.
|
||||
Options are: colab notebook, kubernetes, CPU/GPU docker container, test environment, jupyter notebook, python script
|
||||
"""
|
||||
from haystack.telemetry import HAYSTACK_DOCKER_CONTAINER
|
||||
|
||||
if os.environ.get("CI", "False").lower() == "true":
|
||||
execution_env = "ci"
|
||||
elif "google.colab" in sys.modules:
|
||||
|
||||
@ -12,6 +12,7 @@ from haystack.modeling.model.adaptive_model import AdaptiveModel
|
||||
from haystack.modeling.model.biadaptive_model import BiAdaptiveModel
|
||||
from haystack.modeling.model.optimization import WrappedDataParallel
|
||||
from haystack.utils.experiment_tracking import Tracker as tracker
|
||||
from haystack.telemetry_2 import send_event
|
||||
from haystack.modeling.visual import BUSH_SEP
|
||||
|
||||
|
||||
@ -57,6 +58,7 @@ class Evaluator:
|
||||
:return: all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
|
||||
and reports generated during evaluation.
|
||||
"""
|
||||
send_event("Evaluator.eval()")
|
||||
model.prediction_heads[0].use_confidence_scores_for_ranking = use_confidence_scores_for_ranking
|
||||
model.prediction_heads[0].use_no_answer_legacy_confidence = use_no_answer_legacy_confidence
|
||||
model.eval()
|
||||
|
||||
@ -22,6 +22,7 @@ from haystack.modeling.model.optimization import get_scheduler, WrappedDataParal
|
||||
from haystack.modeling.utils import GracefulKiller
|
||||
from haystack.utils.experiment_tracking import Tracker as tracker
|
||||
from haystack.utils.early_stopping import EarlyStopping
|
||||
from haystack.telemetry import send_event
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -163,6 +164,7 @@ class Trainer:
|
||||
:return: Returns the model after training. When you do ``early_stopping``
|
||||
with a ``save_dir`` the best model is loaded and returned.
|
||||
"""
|
||||
send_event("Trainer.train()")
|
||||
# connect the prediction heads with the right output from processor
|
||||
self.model.connect_heads_with_processor(self.data_silo.processor.tasks, require_labels=True)
|
||||
# Check that the tokenizer(s) fits the language model(s)
|
||||
|
||||
@ -30,6 +30,7 @@ from haystack.modeling.utils import initialize_device_settings
|
||||
from haystack.nodes.base import BaseComponent
|
||||
from haystack.schema import Document
|
||||
from haystack.utils.reflection import retry_with_exponential_backoff
|
||||
from haystack.telemetry_2 import send_event
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -769,6 +770,7 @@ class PromptNode(BaseComponent):
|
||||
:param stop_words: Stops text generation if any one of the stop words is generated.
|
||||
:param model_kwargs: Additional keyword arguments passed when loading the model specified by `model_name_or_path`.
|
||||
"""
|
||||
send_event("PromptNode initialized")
|
||||
super().__init__()
|
||||
self.prompt_templates: Dict[str, PromptTemplate] = {pt.name: pt for pt in get_predefined_prompt_templates()} # type: ignore
|
||||
self.default_prompt_template: Union[str, PromptTemplate, None] = default_prompt_template
|
||||
@ -827,6 +829,7 @@ class PromptNode(BaseComponent):
|
||||
:param prompt_template: The name or object of the optional PromptTemplate to use.
|
||||
:return: A list of strings as model responses.
|
||||
"""
|
||||
send_event("PromptNode.prompt()", event_properties={"template": str(prompt_template)})
|
||||
results = []
|
||||
# we pop the prompt_collector kwarg to avoid passing it to the model
|
||||
prompt_collector: List[str] = kwargs.pop("prompt_collector", [])
|
||||
|
||||
@ -28,6 +28,7 @@ from haystack.schema import Document, Answer, Span
|
||||
from haystack.document_stores.base import BaseDocumentStore
|
||||
from haystack.nodes.reader.base import BaseReader
|
||||
from haystack.utils.early_stopping import EarlyStopping
|
||||
from haystack.telemetry_2 import send_event
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -434,6 +435,7 @@ class FARMReader(BaseReader):
|
||||
:param max_query_length: Maximum length of the question in number of tokens.
|
||||
:return: None
|
||||
"""
|
||||
send_event("FARMReader.train()")
|
||||
return self._training_procedure(
|
||||
data_dir=data_dir,
|
||||
train_filename=train_filename,
|
||||
@ -555,6 +557,7 @@ class FARMReader(BaseReader):
|
||||
:param early_stopping: An initialized EarlyStopping object to control early stopping and saving of the best models.
|
||||
:return: None
|
||||
"""
|
||||
send_event("FARMReader.distil_prediction_layer_from()")
|
||||
return self._training_procedure(
|
||||
data_dir=data_dir,
|
||||
train_filename=train_filename,
|
||||
@ -677,6 +680,7 @@ class FARMReader(BaseReader):
|
||||
:param early_stopping: An initialized EarlyStopping object to control early stopping and saving of the best models.
|
||||
:return: None
|
||||
"""
|
||||
send_event("FARMReader.distil_intermediate_layers_from()")
|
||||
return self._training_procedure(
|
||||
data_dir=data_dir,
|
||||
train_filename=train_filename,
|
||||
@ -938,7 +942,7 @@ class FARMReader(BaseReader):
|
||||
"Hence, results might slightly differ from those of `Pipeline.eval()`\n."
|
||||
"If you are just about starting to evaluate your model consider using `Pipeline.eval()` instead."
|
||||
)
|
||||
|
||||
send_event("FARMReader.eval_on_file()")
|
||||
if device is None:
|
||||
device = self.devices[0]
|
||||
else:
|
||||
@ -1016,7 +1020,7 @@ class FARMReader(BaseReader):
|
||||
"Hence, results might slightly differ from those of `Pipeline.eval()`\n."
|
||||
"If you are just about starting to evaluate your model consider using `Pipeline.eval()` instead."
|
||||
)
|
||||
|
||||
send_event("FARMReader.eval()")
|
||||
if device is None:
|
||||
device = self.devices[0]
|
||||
else:
|
||||
|
||||
@ -31,6 +31,7 @@ from haystack.nodes.retriever._losses import _TRAINING_LOSSES
|
||||
from haystack.nodes.retriever._openai_encoder import _OpenAIEmbeddingEncoder
|
||||
from haystack.schema import Document
|
||||
from haystack.utils.reflection import retry_with_exponential_backoff
|
||||
from haystack.telemetry_2 import send_event
|
||||
|
||||
from ._base_embedding_encoder import _BaseEmbeddingEncoder
|
||||
|
||||
@ -199,6 +200,7 @@ class _SentenceTransformersEmbeddingEncoder(_BaseEmbeddingEncoder):
|
||||
reference the Sentence-Transformers [documentation](https://www.sbert.net/docs/training/overview.html#sentence_transformers.SentenceTransformer.fit)
|
||||
for a full list of keyword arguments.
|
||||
"""
|
||||
send_event("SentenceTransformersEmbeddingEncoder.train()")
|
||||
|
||||
if train_loss not in _TRAINING_LOSSES:
|
||||
raise ValueError(f"Unrecognized train_loss {train_loss}. Should be one of: {_TRAINING_LOSSES.keys()}")
|
||||
|
||||
@ -10,6 +10,7 @@ from tqdm.auto import tqdm
|
||||
from haystack.schema import Document, MultiLabel
|
||||
from haystack.errors import HaystackError, PipelineError
|
||||
from haystack.nodes.base import BaseComponent
|
||||
from haystack.telemetry_2 import send_event
|
||||
from haystack.document_stores.base import BaseDocumentStore, BaseKnowledgeGraph, FilterType
|
||||
|
||||
|
||||
@ -153,7 +154,7 @@ class BaseRetriever(BaseComponent):
|
||||
contains the keys "predictions" and "metrics".
|
||||
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
|
||||
"""
|
||||
|
||||
send_event("BaseRetriever.eval()")
|
||||
# Extract all questions for evaluation
|
||||
filters: Dict = {"origin": [label_origin]}
|
||||
|
||||
|
||||
@ -43,6 +43,7 @@ from haystack.modeling.data_handler.dataloader import NamedDataLoader
|
||||
from haystack.modeling.model.optimization import initialize_optimizer
|
||||
from haystack.modeling.training.base import Trainer
|
||||
from haystack.modeling.utils import initialize_device_settings
|
||||
from haystack.telemetry_2 import send_event
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -654,6 +655,7 @@ class DensePassageRetriever(DenseRetriever):
|
||||
Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps.
|
||||
If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint.
|
||||
"""
|
||||
send_event("DensePassageRetriever.train()")
|
||||
self.processor.embed_title = embed_title
|
||||
self.processor.data_dir = Path(data_dir)
|
||||
self.processor.train_filename = train_filename
|
||||
@ -1305,6 +1307,7 @@ class TableTextRetriever(DenseRetriever):
|
||||
:param checkpoints_to_keep: The maximum number of train checkpoints to save.
|
||||
:param early_stopping: An initialized EarlyStopping object to control early stopping and saving of the best models.
|
||||
"""
|
||||
send_event("TableTextRetriever.train()")
|
||||
if embed_meta_fields is None:
|
||||
embed_meta_fields = ["page_title", "section_title", "caption"]
|
||||
|
||||
@ -1910,6 +1913,7 @@ class EmbeddingRetriever(DenseRetriever):
|
||||
reference the Sentence-Transformers [documentation](https://www.sbert.net/docs/training/overview.html#sentence_transformers.SentenceTransformer.fit)
|
||||
for a full list of keyword arguments.
|
||||
"""
|
||||
send_event("EmbeddingRetriever.train()")
|
||||
self.embedding_encoder.train(
|
||||
training_data,
|
||||
learning_rate=learning_rate,
|
||||
|
||||
@ -22,6 +22,7 @@ import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
import mmh3
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import networkx as nx
|
||||
@ -53,6 +54,7 @@ from haystack.nodes.retriever.base import BaseRetriever
|
||||
from haystack.document_stores.base import BaseDocumentStore
|
||||
from haystack.telemetry import send_event, send_custom_event, is_telemetry_enabled
|
||||
from haystack.utils.experiment_tracking import MLflowTrackingHead, Tracker as tracker
|
||||
from haystack.telemetry_2 import send_pipeline_run_event, send_pipeline_event, send_event as send_event_2
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -79,6 +81,7 @@ class Pipeline:
|
||||
self.last_window_run_total = 0
|
||||
self.run_total = 0
|
||||
self.sent_event_in_window = False
|
||||
self.yaml_hash = False
|
||||
|
||||
@property
|
||||
def root_node(self) -> Optional[str]:
|
||||
@ -432,6 +435,14 @@ class Pipeline:
|
||||
node={"name": name, "inputs": inputs},
|
||||
instance=component,
|
||||
)
|
||||
# TELEMETRY: Hash the config of the pipeline without node names
|
||||
# to be able to cluster later by "pipeline type"
|
||||
# (is any specific pipeline configuration very popular?)
|
||||
fingerprint_config = copy.copy(self.get_config())
|
||||
for comp in fingerprint_config["components"]:
|
||||
del comp["name"]
|
||||
fingerprint = json.dumps(fingerprint_config, default=str)
|
||||
self.fingerprint = "{:02x}".format(mmh3.hash128(fingerprint, signed=False))
|
||||
|
||||
def get_node(self, name: str) -> Optional[BaseComponent]:
|
||||
"""
|
||||
@ -482,6 +493,18 @@ class Pipeline:
|
||||
about their execution. By default, this information includes the input parameters
|
||||
the Nodes received and the output they generated. You can then find all debug information in the dictionary returned by this method under the key `_debug`.
|
||||
"""
|
||||
send_pipeline_run_event(
|
||||
pipeline=self,
|
||||
event_name="Pipeline.run()",
|
||||
query=query,
|
||||
file_paths=file_paths,
|
||||
labels=labels,
|
||||
documents=documents,
|
||||
meta=meta,
|
||||
params=params,
|
||||
debug=debug,
|
||||
)
|
||||
|
||||
# validate the node names
|
||||
self._validate_node_names_in_params(params=params)
|
||||
|
||||
@ -618,6 +641,18 @@ class Pipeline:
|
||||
about their execution. By default, this information includes the input parameters
|
||||
the Nodes received and the output they generated. You can then find all debug information in the dictionary returned by this method under the key `_debug`.
|
||||
"""
|
||||
send_pipeline_run_event(
|
||||
pipeline=self,
|
||||
event_name="Pipeline.run_batch()",
|
||||
queries=queries,
|
||||
file_paths=file_paths,
|
||||
labels=labels,
|
||||
documents=documents,
|
||||
meta=meta,
|
||||
params=params,
|
||||
debug=debug,
|
||||
)
|
||||
|
||||
if file_paths is not None or meta is not None:
|
||||
logger.info(
|
||||
"It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch."
|
||||
@ -773,6 +808,17 @@ class Pipeline:
|
||||
Returns a tuple containing the ncdg, map, recall and precision scores.
|
||||
Each metric is represented by a dictionary containing the scores for each top_k value.
|
||||
"""
|
||||
send_event_2(
|
||||
event_name="Pipeline.eval_beir()",
|
||||
event_properties={
|
||||
"dataset": dataset,
|
||||
"index_pipeline": index_pipeline.yaml_hash,
|
||||
"query_pipeline": query_pipeline.yaml_hash,
|
||||
"num_documents": num_documents,
|
||||
"top_k_values": top_k_values,
|
||||
},
|
||||
)
|
||||
|
||||
if index_params is None:
|
||||
index_params = {}
|
||||
if query_params is None:
|
||||
@ -1211,6 +1257,8 @@ class Pipeline:
|
||||
Additional information can be found here
|
||||
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
|
||||
"""
|
||||
send_pipeline_event(pipeline=self, event_name="Pipeline.eval()")
|
||||
|
||||
eval_result = EvaluationResult()
|
||||
if add_isolated_node_eval:
|
||||
params = {} if params is None else params.copy()
|
||||
@ -1328,6 +1376,8 @@ class Pipeline:
|
||||
Additional information can be found here
|
||||
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
|
||||
"""
|
||||
send_pipeline_event(pipeline=self, event_name="Pipeline.eval_batch()")
|
||||
|
||||
eval_result = EvaluationResult()
|
||||
if add_isolated_node_eval:
|
||||
params = {} if params is None else params.copy()
|
||||
@ -1956,14 +2006,15 @@ class Pipeline:
|
||||
`_` sign must be used to specify nested hierarchical properties.
|
||||
:param strict_version_check: whether to fail in case of a version mismatch (throws a warning otherwise)
|
||||
"""
|
||||
|
||||
config = read_pipeline_config_from_yaml(path)
|
||||
return cls.load_from_config(
|
||||
pipeline = cls.load_from_config(
|
||||
pipeline_config=config,
|
||||
pipeline_name=pipeline_name,
|
||||
overwrite_with_env_variables=overwrite_with_env_variables,
|
||||
strict_version_check=strict_version_check,
|
||||
)
|
||||
pipeline.yaml_hash = "{:02x}".format(mmh3.hash128(str(path), signed=False))
|
||||
return pipeline
|
||||
|
||||
@classmethod
|
||||
def load_from_config(
|
||||
|
||||
@ -17,10 +17,12 @@ from pathlib import Path
|
||||
import yaml
|
||||
import posthog
|
||||
|
||||
from haystack.environment import HAYSTACK_EXECUTION_CONTEXT, get_or_create_env_meta_data
|
||||
from haystack.environment import get_or_create_env_meta_data
|
||||
|
||||
posthog.api_key = "phc_F5v11iI2YHkoP6Er3cPILWSrLhY3D6UY4dEMga4eoaa"
|
||||
posthog.host = "https://tm.hs.deepset.ai"
|
||||
HAYSTACK_EXECUTION_CONTEXT = "HAYSTACK_EXECUTION_CONTEXT"
|
||||
HAYSTACK_DOCKER_CONTAINER = "HAYSTACK_DOCKER_CONTAINER"
|
||||
HAYSTACK_TELEMETRY_ENABLED = "HAYSTACK_TELEMETRY_ENABLED"
|
||||
HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED = "HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED"
|
||||
CONFIG_PATH = Path("~/.haystack/config.yaml").expanduser()
|
||||
@ -142,6 +144,8 @@ def send_custom_event(event: str = "", payload: Optional[Dict[str, Any]] = None)
|
||||
:param event: Name of the event. Use a noun and a verb, e.g., "evaluation started", "component created"
|
||||
:param payload: A dictionary containing event meta data, e.g., parameter settings
|
||||
"""
|
||||
if os.environ.get("HAYSTACK_TELEMETRY_VERSION", "2") != "1":
|
||||
return
|
||||
global user_id # pylint: disable=global-statement
|
||||
if payload is None:
|
||||
payload = {}
|
||||
@ -179,6 +183,7 @@ def send_custom_event(event: str = "", payload: Optional[Dict[str, Any]] = None)
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
print("Exception! ", e)
|
||||
logger.debug("Telemetry was not able to send an event.", exc_info=e)
|
||||
|
||||
|
||||
|
||||
238
haystack/telemetry_2.py
Normal file
238
haystack/telemetry_2.py
Normal file
@ -0,0 +1,238 @@
|
||||
import os
|
||||
from typing import Any, Dict, Optional, List, Union
|
||||
import uuid
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import json
|
||||
import yaml
|
||||
|
||||
import posthog
|
||||
|
||||
from haystack.environment import collect_static_system_specs, collect_dynamic_system_specs
|
||||
|
||||
HAYSTACK_TELEMETRY_ENABLED = "HAYSTACK_TELEMETRY_ENABLED"
|
||||
HAYSTACK_EXECUTION_CONTEXT = "HAYSTACK_EXECUTION_CONTEXT"
|
||||
HAYSTACK_DOCKER_CONTAINER = "HAYSTACK_DOCKER_CONTAINER"
|
||||
CONFIG_PATH = Path("~/.haystack/config.yaml").expanduser()
|
||||
LOG_PATH = Path("~/.haystack/telemetry.log").expanduser()
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Telemetry:
    """
    Haystack reports anonymous usage statistics to support continuous software improvements for all its users.

    You can opt-out of sharing usage statistics by manually setting the environment
    variable `HAYSTACK_TELEMETRY_ENABLED` as described for different operating systems on the
    [documentation page](https://docs.haystack.deepset.ai/docs/telemetry#how-can-i-opt-out).

    Check out the documentation for more details: [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry).
    """

    def __init__(self):
        """
        Initializes the telemetry. Loads the user_id from the config file,
        or creates a new id and saves it if the file is not found.

        It also collects system information which cannot change across the lifecycle
        of the process (for example `is_containerized()`).

        Problems while reading or writing the config file are logged at debug level and
        never raised, so that creating this object cannot break the import of Haystack.
        """
        posthog.api_key = "phc_C44vUK9R1J6HYVdfJarTEPqVAoRPJzMXzFcj8PIrJgP"
        posthog.host = "https://eu.posthog.com"

        # disable posthog logging
        for module_name in ["posthog", "backoff"]:
            logging.getLogger(module_name).setLevel(logging.CRITICAL)
            # Prevent module from sending errors to stderr when an exception is encountered during an emit() call
            logging.getLogger(module_name).addHandler(logging.NullHandler())
            logging.getLogger(module_name).propagate = False

        self.user_id = None

        if CONFIG_PATH.exists():
            # Load the config file
            try:
                with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
                    config = yaml.safe_load(config_file)
                # An empty or malformed file does not necessarily contain a mapping
                if isinstance(config, dict) and "user_id" in config:
                    self.user_id = config["user_id"]
            except Exception as e:
                logger.debug("Telemetry could not read the config file %s", CONFIG_PATH, exc_info=e)
        else:
            # Create the config file
            logger.info(
                "Haystack sends anonymous usage data to understand the actual usage and steer dev efforts "
                "towards features that are most meaningful to users. You can opt-out at anytime by manually "
                "setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different "
                "operating systems in the [documentation page](https://docs.haystack.deepset.ai/docs/telemetry#how-can-i-opt-out). "
                "More information at [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry)."
            )
            self.user_id = str(uuid.uuid4())
            try:
                # Creating the folder can fail as well (read-only or missing home directory),
                # so it belongs to the guarded section together with the write itself.
                CONFIG_PATH.parents[0].mkdir(parents=True, exist_ok=True)
                # Same encoding that is used when the file is read back
                with open(CONFIG_PATH, "w", encoding="utf-8") as outfile:
                    yaml.dump({"user_id": self.user_id}, outfile, default_flow_style=False)
            except Exception as e:
                logger.debug("Telemetry could not write config file to %s", CONFIG_PATH, exc_info=e)

        self.event_properties = collect_static_system_specs()

    def send_event(self, event_name: str, event_properties: Optional[Dict[str, Any]] = None):
        """
        Sends a telemetry event. Any failure is logged at debug level and never raised.

        :param event_name: The name of the event to show in PostHog.
        :param event_properties: Additional event metadata. These are merged with the
            system metadata collected in __init__, so take care not to overwrite them.
        """
        event_properties = event_properties or {}
        try:
            # Collected inside the try block: gathering the specs must not break the caller either
            dynamic_specs = collect_dynamic_system_specs()
            posthog.capture(
                distinct_id=self.user_id,
                event=event_name,
                # loads/dumps to sort the keys
                properties=json.loads(
                    json.dumps({**self.event_properties, **dynamic_specs, **event_properties}, sort_keys=True)
                ),
            )
        except Exception as e:
            logger.debug("Telemetry couldn't make a POST request to PostHog.", exc_info=e)
|
||||
|
||||
|
||||
def send_pipeline_run_event(  # type: ignore
    event_name: str,
    pipeline: "Pipeline",  # type: ignore
    query: Optional[str] = None,
    queries: Optional[List[str]] = None,
    file_paths: Optional[List[str]] = None,
    labels: Optional[Union["MultiLabel", List["MultiLabel"]]] = None,  # type: ignore
    documents: Optional[Union[List["Document"], List[List["Document"]]]] = None,  # type: ignore
    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    params: Optional[dict] = None,
    debug: Optional[bool] = None,
):
    """
    Sends a telemetry event about the execution of a pipeline, if telemetry is enabled.

    Outside of the public demo, only sizes and flags derived from the inputs are added to
    the event, never the inputs themselves. Any failure is logged at debug level and never raised.

    :param event_name: The name of the event to show in PostHog.
    :param pipeline: the pipeline that is running
    :param query: the value of the `query` input of the pipeline, if any
    :param queries: the value of the `queries` input of the pipeline, if any
    :param file_paths: the value of the `file_paths` input of the pipeline, if any
    :param labels: the value of the `labels` input of the pipeline, if any
    :param documents: the value of the `documents` input of the pipeline, if any
    :param meta: the value of the `meta` input of the pipeline, if any
    :param params: the value of the `params` input of the pipeline, if any
    :param debug: the value of the `debug` input of the pipeline, if any
    """
    try:
        if telemetry:
            event_properties: Dict[str, Optional[Union[str, bool, int, Dict[str, Any]]]] = {}

            # Check if it's the public demo
            exec_context = os.environ.get(HAYSTACK_EXECUTION_CONTEXT, "")
            if exec_context == "public_demo":
                event_properties["pipeline.is_public_demo"] = True
                event_properties["pipeline.run_parameters.query"] = query
                event_properties["pipeline.run_parameters.params"] = params
                telemetry.send_event(event_name=event_name, event_properties=event_properties)
                return

            # Collect pipeline profile
            event_properties["pipeline.classname"] = pipeline.__class__.__name__
            event_properties["pipeline.fingerprint"] = pipeline.fingerprint
            if pipeline.yaml_hash:
                event_properties["pipeline.yaml_hash"] = pipeline.yaml_hash

            # Add document store
            docstore = pipeline.get_document_store()
            if docstore:
                event_properties["pipeline.document_store"] = docstore.__class__.__name__

            # Add an entry for each node class and classify the pipeline by its root node
            for node in pipeline.graph.nodes:
                node_type = pipeline.graph.nodes.get(node)["component"].__class__.__name__
                if node_type == "RootNode":
                    event_properties["pipeline.type"] = node
                else:
                    event_properties["pipeline.nodes." + node_type] = (
                        event_properties.get("pipeline.nodes." + node_type, 0) + 1  # type: ignore
                    )

            # Inputs of the run() or run_batch() call
            if isinstance(labels, list):
                labels_len = len(labels)
            else:
                labels_len = 1 if labels else 0
            if documents and isinstance(documents, list) and isinstance(documents[0], list):
                documents_len = [len(docs) if isinstance(docs, list) else 0 for docs in documents]
            elif isinstance(documents, list):
                documents_len = [len(documents)]
            else:
                documents_len = [0]
            # Same convention as for the labels: a missing or empty `meta` counts as 0, not as 1
            if isinstance(meta, list):
                meta_len = len(meta)
            else:
                meta_len = 1 if meta else 0
            event_properties["pipeline.run_parameters.queries"] = len(queries) if queries else bool(query)
            event_properties["pipeline.run_parameters.file_paths"] = len(file_paths or [])
            event_properties["pipeline.run_parameters.labels"] = labels_len
            event_properties["pipeline.run_parameters.documents"] = documents_len  # type: ignore
            event_properties["pipeline.run_parameters.meta"] = meta_len
            event_properties["pipeline.run_parameters.params"] = bool(params)
            event_properties["pipeline.run_parameters.debug"] = bool(debug)

            telemetry.send_event(event_name=event_name, event_properties=event_properties)
    except Exception as e:
        # Never let telemetry break things
        logger.debug("There was an issue sending a %s telemetry event", event_name, exc_info=e)
|
||||
|
||||
|
||||
def send_pipeline_event(pipeline: "Pipeline", event_name: str):  # type: ignore
    """
    Reports a pipeline-related event other than a run() call, provided telemetry is enabled.

    The event carries the class name, the fingerprint and the YAML hash of the pipeline.
    Failures are logged at debug level and never raised.

    :param pipeline: the pipeline the event refers to
    :param event_name: The name of the event to show in PostHog.
    """
    try:
        if not telemetry:
            return
        pipeline_profile = {
            "pipeline.classname": pipeline.__class__.__name__,
            "pipeline.fingerprint": pipeline.fingerprint,
            "pipeline.yaml_hash": pipeline.yaml_hash,
        }
        telemetry.send_event(event_name=event_name, event_properties=pipeline_profile)
    except Exception as e:
        # Telemetry must stay invisible to the user, even when it fails
        logger.debug("There was an issue sending a '%s' telemetry event", event_name, exc_info=e)
|
||||
|
||||
|
||||
def send_event(event_name: str, event_properties: Optional[Dict[str, Any]] = None):
    """
    Reports a generic telemetry event, provided telemetry is enabled.

    Failures are logged at debug level and never raised.

    :param event_name: The name of the event to show in PostHog.
    :param event_properties: Additional event metadata, if any.
    """
    try:
        if not telemetry:
            return
        telemetry.send_event(event_name=event_name, event_properties=event_properties)
    except Exception as e:
        # Telemetry must stay invisible to the user, even when it fails
        logger.debug("There was an issue sending a '%s' telemetry event", event_name, exc_info=e)
|
||||
|
||||
|
||||
def _serializer(obj):
|
||||
"""
|
||||
Small function used to build pipeline fingerprints and safely serialize any object.
|
||||
"""
|
||||
try:
|
||||
return str(obj)
|
||||
except:
|
||||
return "~ non serializable object ~"
|
||||
|
||||
|
||||
# The new telemetry is only active when it is the selected telemetry version AND the user
# did not opt out through the HAYSTACK_TELEMETRY_ENABLED environment variable, which is the
# opt-out advertised in the docstring of the Telemetry class.
if os.environ.get("HAYSTACK_TELEMETRY_VERSION", "2") == "2" and os.environ.get(
    HAYSTACK_TELEMETRY_ENABLED, "True"
).strip().lower() not in ("false", "0"):
    telemetry = Telemetry()
else:
    # All the send_*_event functions of this module are no-ops when `telemetry` is None
    telemetry = None  # type: ignore
|
||||
@ -1,10 +1,13 @@
|
||||
from typing import Optional, Any, Dict, Union
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, Any, Dict, Union
|
||||
|
||||
import mlflow
|
||||
from requests.exceptions import ConnectionError
|
||||
|
||||
from haystack import __version__
|
||||
from haystack.environment import get_or_create_env_meta_data
|
||||
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ import requests
|
||||
|
||||
from haystack.telemetry import send_tutorial_event
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
@ -1,130 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
from unittest.mock import patch, PropertyMock
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack import telemetry
|
||||
from haystack.errors import PipelineSchemaError
|
||||
from haystack.telemetry import (
|
||||
NonPrivateParameters,
|
||||
send_event,
|
||||
enable_writing_events_to_file,
|
||||
disable_writing_events_to_file,
|
||||
send_custom_event,
|
||||
_delete_telemetry_file,
|
||||
disable_telemetry,
|
||||
enable_telemetry,
|
||||
TelemetryFileType,
|
||||
_write_telemetry_config,
|
||||
)
|
||||
|
||||
|
||||
@patch.object(
    NonPrivateParameters, "param_names", new_callable=PropertyMock, return_value=["top_k", "model_name_or_path"]
)
def test_private_params_not_tracked(mock_nonprivateparameters):
    """
    A parameter that is not listed in the (patched) `param_names` must be dropped by `apply_filter`.
    """
    filtered = NonPrivateParameters.apply_filter({"hostname": "private_hostname", "top_k": 2})
    assert filtered == {"top_k": 2}
|
||||
|
||||
|
||||
@patch.object(
    NonPrivateParameters, "param_names", new_callable=PropertyMock, return_value=["top_k", "model_name_or_path"]
)
def test_non_private_params_tracked(mock_nonprivateparameters):
    """
    Parameters that are all listed in the (patched) `param_names` must pass through `apply_filter` unchanged.
    """
    params = {"model_name_or_path": "test-model", "top_k": 2}
    assert NonPrivateParameters.apply_filter(params) == params
|
||||
|
||||
|
||||
@patch.object(NonPrivateParameters, "param_names", new_callable=PropertyMock, return_value=[])
def test_only_non_private_params(mock_nonprivateparameters):
    """
    With an empty (patched) `param_names` list, `apply_filter` must return an empty dict.
    """
    assert NonPrivateParameters.apply_filter({"top_k": 2}) == {}
|
||||
|
||||
|
||||
@pytest.mark.integration
@patch("posthog.capture")
@patch.object(
    NonPrivateParameters,
    "param_names",
    new_callable=PropertyMock,
    return_value=["top_k", "model_name_or_path", "add_isolated_node_eval"],
)
# patches are applied in bottom-up order, which is why mock_nonprivateparameters is the first parameter and mock_posthog_capture is the second
def test_send_event_via_decorator(mock_nonprivateparameters, mock_posthog_capture):
    """
    Calling a method decorated with `@send_event` must report a "<class>.<method> executed" event to posthog,
    carrying the non-private call parameters as properties.
    """

    class TestClass:
        @send_event
        def run(self, add_isolated_node_eval: bool = False):
            pass

    TestClass().run(add_isolated_node_eval=True)

    # todo replace [1] with .kwargs when moving from python 3.7 to 3.8 in CI
    capture_kwargs = mock_posthog_capture.call_args[1]
    assert capture_kwargs["event"] == "TestClass.run executed"
    assert capture_kwargs["properties"]["add_isolated_node_eval"]
|
||||
|
||||
|
||||
@pytest.mark.integration
@patch("posthog.capture")
def test_send_event_if_custom_error_raised(mock_posthog_capture):
    """
    Raising `PipelineSchemaError` must report a "PipelineSchemaError raised" event to posthog.
    """
    with pytest.raises(PipelineSchemaError):
        raise PipelineSchemaError

    # todo replace [1] with .kwargs when moving from python 3.7 to 3.8 in CI
    capture_kwargs = mock_posthog_capture.call_args[1]
    assert capture_kwargs["event"] == "PipelineSchemaError raised"
|
||||
|
||||
|
||||
def num_lines(path: Path):
    """
    Count the lines of the file at `path`.

    :param path: Location of the file to inspect.
    :return: The number of lines in the file, or 0 when `path` does not point to a file.
    """
    if not path.is_file():
        return 0
    with open(path, "r") as f:
        return sum(1 for _ in f)
|
||||
|
||||
|
||||
@pytest.mark.integration
@patch("posthog.capture")
def test_write_to_file(mock_posthog_capture, monkeypatch):
    """
    Events must be appended to the telemetry log file only while writing events to file is enabled.
    """
    monkeypatch.setattr(telemetry, "LOG_PATH", Path("~/.haystack/telemetry_test.log").expanduser())

    def lines_added_by_one_event():
        # Send a single custom event and report by how many lines the log file grew
        lines_before = num_lines(telemetry.LOG_PATH)
        send_custom_event(event="test")
        return num_lines(telemetry.LOG_PATH) - lines_before

    # Before file writing is switched on, sending an event must leave the log untouched
    assert lines_added_by_one_event() == 0

    enable_writing_events_to_file()
    assert lines_added_by_one_event() == 1

    disable_writing_events_to_file()
    assert lines_added_by_one_event() == 0

    _delete_telemetry_file(TelemetryFileType.LOG_FILE)
|
||||
|
||||
|
||||
@pytest.mark.integration
@patch("posthog.capture")
def test_disable_enable_telemetry(mock_posthog_capture, monkeypatch):
    """
    `disable_telemetry()` must emit one final "telemetry disabled" event and then suppress further events
    until `enable_telemetry()` is called.
    """
    monkeypatch.setattr(telemetry, "HAYSTACK_TELEMETRY_ENABLED", "HAYSTACK_TELEMETRY_ENABLED_TEST")
    monkeypatch.setattr(telemetry, "CONFIG_PATH", Path("~/.haystack/config_test.yaml").expanduser())

    def emit_test_event():
        send_custom_event(event="test")

    # config_test.yaml doesn't exist yet and won't be created automatically because the global user_id might have been set already by other tests
    _write_telemetry_config()
    for _ in range(2):
        emit_test_event()
    assert mock_posthog_capture.call_count == 2, "two events should be sent"

    disable_telemetry()
    emit_test_event()
    assert mock_posthog_capture.call_count == 3, "one additional event should be sent"
    # todo replace [1] with .kwargs when moving from python 3.7 to 3.8 in CI
    last_capture_kwargs = mock_posthog_capture.call_args[1]
    assert last_capture_kwargs["event"] == "telemetry disabled", "a final event should be sent"
    emit_test_event()
    assert mock_posthog_capture.call_count == 3, "no additional event should be sent"

    enable_telemetry()
    emit_test_event()
    assert mock_posthog_capture.call_count == 4, "one additional event should be sent"
|
||||
@ -2048,44 +2048,3 @@ def test_fix_to_pipeline_execution_when_join_follows_join():
|
||||
res = pipeline.run(query="Alpha Beta Gamma Delta")
|
||||
documents = res["documents"]
|
||||
assert len(documents) == 4 # all four documents should be found
|
||||
|
||||
|
||||
def test_send_pipeline_event():
    """
    Check that the pipeline event is sent exactly once and that the pipeline's bookkeeping fields
    (`time_of_last_sent_event`, `last_window_run_total`) are updated accordingly.
    """
    pipe = Pipeline()
    pipe.add_node(MockNode(), name="mock_node", inputs=["Query"])

    with mock.patch("haystack.pipelines.base.send_custom_event") as send_mock:
        # Midnight of the current day, tagged as UTC; must be computed before the event is sent
        expected_timestamp = datetime.datetime.combine(
            datetime.datetime.now(), datetime.time.min, datetime.timezone.utc
        )
        pipe.send_pipeline_event()
        send_mock.assert_called_once()
        assert pipe.time_of_last_sent_event == expected_timestamp
        assert pipe.last_window_run_total == 0
|
||||
|
||||
|
||||
def test_send_pipeline_event_unserializable_param():
    """
    Check that the pipeline event is still sent when a component was initialized with a
    non-serializable parameter, see https://github.com/deepset-ai/haystack/issues/3833
    """

    class CustomNode(MockNode):
        """A mock node that can be inited passing a param"""

        def __init__(self, param):
            self.param = param

    pipe = Pipeline()
    # the node is created with a parameter that can't be serialized (an empty set)
    pipe.add_node(CustomNode(param=set()), name="custom_node", inputs=["Query"])

    with mock.patch("haystack.pipelines.base.send_custom_event") as send_mock:
        # Midnight of the current day, tagged as UTC; must be computed before the event is sent
        expected_timestamp = datetime.datetime.combine(
            datetime.datetime.now(), datetime.time.min, datetime.timezone.utc
        )
        pipe.send_pipeline_event()
        send_mock.assert_called_once()
        assert pipe.time_of_last_sent_event == expected_timestamp
        assert pipe.last_window_run_total == 0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user