mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-21 03:29:03 +00:00
* Add new audio answer primitives * Add AnswerToSpeech * Add dependency group * Update Documentation & Code Style * Extract TextToSpeech in a helper class, create DocumentToSpeech and primitives * Add tests * Update Documentation & Code Style * Add ability to compress audio and more tests * Add audio group to test, all and all-gpu * fix pylint * Update Documentation & Code Style * Accidental git tag * Try pleasing mypy * Update Documentation & Code Style * fix pylint * Add warning for missing OS library and support in CI * Try fixing mypy * Update Documentation & Code Style * Add docs, simplify args for audio nodes and add tutorials * Fix mypy * Fix run_batch * Feedback on tutorials * fix mypy and pylint * Fix mypy again * Fix mypy yet again * Fix the ci * Fix dicts merge and install ffmpeg on CI * Make the audio nodes import safe * Trying to increase tolerance in audio test * Fix import paths * fix linter * Update Documentation & Code Style * Add audio libs in unit tests * Update _text_to_speech.py * Update answer_to_speech.py * Use dedicated dataset & update telemetry * Remove and use distilled roberta * Revert special primitives so that the nodes run in indexing * Improve tutorials and fix smaller bugs * Update Documentation & Code Style * Fix serialization issue * Update Documentation & Code Style * Improve tutorial * Update Documentation & Code Style * Update _text_to_speech.py * Minor lg updates * Minor lg updates to tutorial * Making indexing work in tutorials * Update Documentation & Code Style * Improve docstrings * Try to use GPU when available * Update Documentation & Code Style * Fixi mypy and pylint * Try to pass the device correctly * Update Documentation & Code Style * Use type of device * use .cpu() * Improve .ipynb * update apt index to be able to download libsndfile1 * Fix SpeechDocument.from_dict() * Change pip URL Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Agnieszka Marzec 
<97166305+agnieszka-m@users.noreply.github.com>
307 lines
14 KiB
Python
307 lines
14 KiB
Python
"""
|
|
Telemetry
Haystack reports anonymous usage statistics to support continuous software improvements for all its users.
An example report can be inspected by calling print_telemetry_report(). Check out the documentation for more details: https://haystack.deepset.ai/guides/telemetry
You can opt out of sharing usage statistics by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page.
You can log all events to the local file specified in LOG_PATH for inspection by setting the environment variable HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED to "True".
|
|
"""
|
|
import os
|
|
from typing import Any, Dict, List, Optional
|
|
import uuid
|
|
import logging
|
|
from logging import CRITICAL
|
|
from enum import Enum
|
|
from functools import wraps
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
import posthog
|
|
|
|
from haystack.environment import HAYSTACK_EXECUTION_CONTEXT, get_or_create_env_meta_data
|
|
|
|
# Logger used for all telemetry diagnostics in this module.
logger = logging.getLogger(__name__)

# disable posthog logging (and the logging of the "backoff" logger)
logging.getLogger("backoff").setLevel(CRITICAL)
logging.getLogger("posthog").setLevel(CRITICAL)

# Posthog client configuration: server that receives the events and the project key.
posthog.host = "https://tm.hs.deepset.ai"
posthog.api_key = "phc_F5v11iI2YHkoP6Er3cPILWSrLhY3D6UY4dEMga4eoaa"

# Names of the environment variables that switch telemetry and file logging on or off.
HAYSTACK_TELEMETRY_ENABLED = "HAYSTACK_TELEMETRY_ENABLED"
HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED = "HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED"

# Locations of the local telemetry files inside the user's home directory.
LOG_PATH = Path("~/.haystack/telemetry.log").expanduser()
CONFIG_PATH = Path("~/.haystack/config.yaml").expanduser()

# Cached user id; stays None until it is read from the config file or newly generated.
user_id: Optional[str] = None
|
|
|
|
|
|
class TelemetryFileType(Enum):
    """
    Identifies one of the two local telemetry files: the event log or the config file.

    Members are intentionally left unannotated: an explicit annotation on an enum member makes
    static type checkers reject it or treat it as a plain attribute instead of a member.
    """

    # the event log file
    LOG_FILE = "LOG_FILE"
    # the config file
    CONFIG_FILE = "CONFIG_FILE"
|
|
|
|
|
|
def print_telemetry_report():
    """
    Prints the user id merged with the environment meta data that are sent in events.

    If telemetry is disabled, a short notice is printed instead.
    """
    if not is_telemetry_enabled():
        print("Telemetry is disabled.")
        return

    # user id goes first; meta data entries are merged on top of it
    report = {"user_id": _get_or_create_user_id()}
    report.update(get_or_create_env_meta_data())
    print(report)
|
|
|
|
|
|
def enable_telemetry():
    """
    Switches telemetry on by setting the environment variable HAYSTACK_TELEMETRY_ENABLED to "True",
    so that a limited amount of anonymous usage data is sent as events.
    """
    os.environ.update({HAYSTACK_TELEMETRY_ENABLED: "True"})
    notice = "Telemetry has been enabled."
    logger.info(notice)
|
|
|
|
|
|
def disable_telemetry():
    """
    Switches telemetry off by setting the environment variable HAYSTACK_TELEMETRY_ENABLED to "False".
    No events are sent anymore afterwards, except for one final event.
    """
    os.environ.update({HAYSTACK_TELEMETRY_ENABLED: "False"})
    notice = "Telemetry has been disabled."
    logger.info(notice)
|
|
|
|
|
|
def enable_writing_events_to_file():
    """
    Turns on writing every sent event to the log file specified in LOG_PATH
    by setting HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED to "True".
    """
    os.environ.update({HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED: "True"})
    notice = f"Writing events to log file {LOG_PATH} has been enabled."
    logger.info(notice)
|
|
|
|
|
|
def disable_writing_events_to_file():
    """
    Turns off writing every sent event to the log file specified in LOG_PATH
    by setting HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED to "False".
    """
    os.environ.update({HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED: "False"})
    notice = f"Writing events to log file {LOG_PATH} has been disabled."
    logger.info(notice)
|
|
|
|
|
|
def is_telemetry_enabled() -> bool:
    """
    Tells whether telemetry is switched on.

    Telemetry counts as enabled unless the environment variable HAYSTACK_TELEMETRY_ENABLED
    is set to "false" (compared case-insensitively). An unset variable means enabled.
    """
    configured = os.getenv(HAYSTACK_TELEMETRY_ENABLED, "True")
    is_disabled = configured.lower() == "false"
    return not is_disabled
|
|
|
|
|
|
def is_telemetry_logging_to_file_enabled() -> bool:
    """
    Tells whether telemetry events are additionally written to the local log file.

    Returns False if the environment variable HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED is unset
    or set to "false" (compared case-insensitively); any other value returns True.
    """
    configured = os.getenv(HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED, "False")
    is_disabled = configured.lower() == "false"
    return not is_disabled
|
|
|
|
|
|
def send_event_if_public_demo(func):
    """
    Decorator that sends a "demo query executed" event before calling the wrapped function,
    but only if the environment variable named by HAYSTACK_EXECUTION_CONTEXT equals "public_demo".
    The keyword arguments of the call are passed as the event payload.
    """

    @wraps(func)
    def demo_tracked(*args, **kwargs):
        running_as_public_demo = os.environ.get(HAYSTACK_EXECUTION_CONTEXT, "") == "public_demo"
        if running_as_public_demo:
            send_custom_event(event="demo query executed", payload=kwargs)
        return func(*args, **kwargs)

    return demo_tracked
|
|
|
|
|
|
def send_event(func):
    """
    Can be used as a decorator to send an event formatted like 'Pipeline.eval executed'
    with additional parameters as defined in NonPrivateParameters (e.g. 'add_isolated_node_eval') and
    metadata, such as os_version.

    The class name in the event is taken from the type of the first positional argument (the instance
    when a method is decorated). If the call has no positional arguments, the event name falls back to
    '<function name> executed' so that telemetry never prevents the wrapped call from running.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # args may be empty (e.g. keyword-only call); indexing it blindly would raise before func runs
        owner_prefix = f"{type(args[0]).__name__}." if args else ""
        send_custom_event(event=f"{owner_prefix}{func.__name__} executed", payload=kwargs)
        return func(*args, **kwargs)

    return wrapper
|
|
|
|
|
|
def send_custom_event(event: str = "", payload: Optional[Dict[str, Any]] = None):
    """
    This method can be called directly from anywhere in Haystack to send an event.
    Enriches the given event with metadata and sends it to the posthog server if telemetry is enabled.
    If telemetry has just been disabled, a final event is sent and the config file and the log file are deleted.
    If telemetry is disabled and no config file exists, nothing is sent and no user id or config file is created.

    Any exception raised while sending is logged at debug level and swallowed.

    :param event: Name of the event. Use a noun and a verb, e.g., "evaluation started", "component created"
    :param payload: A dictionary containing event meta data, e.g., parameter settings. Defaults to an empty dict.
    """
    global user_id  # pylint: disable=global-statement
    if payload is None:
        # a fresh dict per call instead of a shared mutable default argument
        payload = {}
    try:

        def send_request(payload: Dict[str, Any]):
            """
            Prepares and sends an event in a post request to a posthog server
            Sending the post request within posthog.capture is non-blocking.

            :param payload: A dictionary containing event meta data, e.g., parameter settings
            """
            event_properties = {**(NonPrivateParameters.apply_filter(payload)), **get_or_create_env_meta_data()}
            if user_id is None:
                raise RuntimeError("User id was not initialized")
            try:
                posthog.capture(distinct_id=user_id, event=event, properties=event_properties)
            except Exception as e:
                logger.debug("Telemetry was not able to make a post request to posthog.", exc_info=e)
            if is_telemetry_enabled() and is_telemetry_logging_to_file_enabled():
                _write_event_to_telemetry_log_file(distinct_id=user_id, event=event, properties=event_properties)

        if is_telemetry_enabled():
            user_id = _get_or_create_user_id()
            send_request(payload=payload)
        elif CONFIG_PATH.exists():
            # if telemetry has just been disabled but the config file has not been deleted yet,
            # then send a final event instead of the triggered event and delete config file and log file afterward.
            # The user id is only resolved here, after the existence check, so that an opted-out user
            # without a config file never gets a new id generated and written to disk.
            user_id = _get_or_create_user_id()
            event = "telemetry disabled"
            send_request(payload={})
            _delete_telemetry_file(TelemetryFileType.CONFIG_FILE)
            _delete_telemetry_file(TelemetryFileType.LOG_FILE)
        else:
            # return without sending any event, not even a final event
            return

    except Exception as e:
        logger.debug("Telemetry was not able to send an event.", exc_info=e)
|
|
|
|
|
|
def send_tutorial_event(url: str):
    """
    Sends a "tutorial <number> executed" event, where the tutorial number is looked up from the
    URL of the dataset the tutorial downloads. Unknown URLs are reported as tutorial "?".

    :param url: URL of the dataset that is loaded in the tutorial.
    """
    tutorial_by_dataset = {
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip": "1",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/squad_small.json.zip": "2",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip": "3",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip": "4",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip": "5",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip": "6",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip": "7",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial8.zip": "8",
        # "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz":"9",
        "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz": "9",
        "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip": "10",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt11.zip": "11",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip": "12",
        # Tutorial 13: no dataset available yet
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip": "14",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip": "15",
        # "https://nlp.stanford.edu/data/glove.6B.zip": "16",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip": "16",
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt17.zip": "17",
    }
    tutorial_number = tutorial_by_dataset.get(url, "?")
    send_custom_event(event=f"tutorial {tutorial_number} executed")
|
|
|
|
|
|
def _get_or_create_user_id() -> str:
    """
    Returns the user id, resolving it on first use.

    The cached module-level id is returned if present. Otherwise the id is loaded from the config file;
    if that yields nothing, a random UUID is generated and written to the config file.
    """
    global user_id  # pylint: disable=global-statement
    if user_id is not None:
        return user_id

    # not cached yet: try the config file first
    _read_telemetry_config()
    if user_id is None:
        # nothing usable in the config file: create a new id and persist it
        user_id = str(uuid.uuid4())
        _write_telemetry_config()
    return user_id
|
|
|
|
|
|
def _read_telemetry_config():
    """
    Loads the config from the file specified in CONFIG_PATH and, if no user id is cached yet,
    takes over the "user_id" stored there. Does nothing if the file does not exist.
    Any error while reading is logged at debug level and otherwise ignored.
    """
    global user_id  # pylint: disable=global-statement
    try:
        if CONFIG_PATH.is_file():
            with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
                stored_config = yaml.safe_load(config_file)
                if "user_id" in stored_config and user_id is None:
                    user_id = stored_config["user_id"]
    except Exception as e:
        logger.debug(f"Telemetry was not able to read the config file {CONFIG_PATH}.", exc_info=e)
|
|
|
|
|
|
def _write_telemetry_config():
    """
    Writes a config file storing the user id to CONFIG_PATH.
    This method logs an info to inform the user about telemetry when the config file is written for the first time.

    If writing fails, the error is logged at debug level and a "config saving failed" event is sent.
    """
    global user_id  # pylint: disable=global-statement
    try:
        # show a log message if telemetry config is written for the first time
        if not CONFIG_PATH.is_file():
            logger.info(
                "Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry"
            )
            CONFIG_PATH.parents[0].mkdir(parents=True, exist_ok=True)
        user_id = _get_or_create_user_id()
        config = {"user_id": user_id}

        # the config file is read back as utf-8, so it is written with the same encoding
        with open(CONFIG_PATH, "w", encoding="utf-8") as outfile:
            yaml.dump(config, outfile, default_flow_style=False)
    except Exception as e:
        logger.debug(f"Could not write config file to {CONFIG_PATH}.", exc_info=e)
        send_custom_event(event="config saving failed")
|
|
|
|
|
|
def _write_event_to_telemetry_log_file(distinct_id: str, event: str, properties: Dict[str, Any]):
    """
    Appends one line of the form "<event>, <properties>, <distinct_id>" to the file specified in LOG_PATH.
    Any error while writing is logged at debug level and otherwise ignored.

    :param distinct_id: The user id the event was sent with
    :param event: Name of the event
    :param properties: The properties that were sent along with the event
    """
    try:
        # explicit encoding so that non-ASCII property values do not depend on the platform default
        with open(LOG_PATH, "a", encoding="utf-8") as file_object:
            file_object.write(f"{event}, {properties}, {distinct_id}\n")
    except Exception as e:
        logger.debug(f"Telemetry was not able to write event to log file {LOG_PATH}.", exc_info=e)
|
|
|
|
|
|
def _delete_telemetry_file(file_type_to_delete: TelemetryFileType):
    """
    Deletes the telemetry config file or log file if it exists.
    Failures to delete (including a missing file) are logged at debug level and otherwise ignored.

    :param file_type_to_delete: TelemetryFileType.LOG_FILE or TelemetryFileType.CONFIG_FILE.
        Any other value is logged at debug level and nothing is deleted.
    """
    if not isinstance(file_type_to_delete, TelemetryFileType):
        logger.debug("File type to delete must be either TelemetryFileType.LOG_FILE or TelemetryFileType.CONFIG_FILE.")
        # stop here: falling through would select CONFIG_PATH for the invalid value and delete the config file
        return
    path = LOG_PATH if file_type_to_delete is TelemetryFileType.LOG_FILE else CONFIG_PATH
    try:
        path.unlink()  # todo add missing_ok=True to the unlink() call when upgrading to python>3.7
    except Exception as e:
        logger.debug(f"Telemetry was not able to delete the {file_type_to_delete} at {path}.", exc_info=e)
|
|
|
|
|
|
class NonPrivateParameters:
    """
    Allow-list of keyword-argument names whose values may be sent in telemetry events.
    """

    # names of the parameters that are kept by apply_filter; everything else is dropped
    param_names: List[str] = ["top_k", "model_name_or_path", "add_isolated_node_eval"]

    @classmethod
    def apply_filter(cls, param_dicts: Dict[str, Any]) -> Dict[str, Any]:
        """
        Ensures that only the values of non-private parameters are sent in events. All other parameter values are filtered out before sending an event.
        If model_name_or_path is a local file path, it will be reduced to the name of the file. The directory names are not sent.

        model_name_or_path may be given as a string or as a path-like object (e.g. pathlib.Path);
        in both cases the returned value is a string.

        :param param_dicts: the keyword arguments that need to be filtered before sending an event
        :return: a new dictionary containing only the allowed parameters; param_dicts is not modified
        :raises TypeError: if model_name_or_path is neither a string nor a path-like object
        """
        tracked_params = {k: param_dicts[k] for k in cls.param_names if k in param_dicts}

        # if model_name_or_path is a local file path, we reduce it to the model name
        if "model_name_or_path" in tracked_params:
            # normalise path-like objects to str; str values pass through unchanged
            model_name_or_path = os.fspath(tracked_params["model_name_or_path"])
            if Path(model_name_or_path).is_file() or model_name_or_path.count(os.path.sep) > 1:
                # if model_name_or_path points to an existing file or contains more than one / it is a path
                model_name_or_path = Path(model_name_or_path).name
            tracked_params["model_name_or_path"] = model_name_or_path
        return tracked_params
|