From 201db5b28813d25066f60e0e710d501623f79682 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 23 Apr 2024 15:42:46 +0200 Subject: [PATCH] docs: fixing all D205 docstring issues (#7577) * fixing all D205 issues * Update haystack/components/embedders/hugging_face_api_document_embedder.py Co-authored-by: Stefano Fiorucci * Update haystack/components/embedders/hugging_face_api_text_embedder.py Co-authored-by: Stefano Fiorucci * Update haystack/components/generators/chat/hugging_face_api.py Co-authored-by: Stefano Fiorucci * Update haystack/components/generators/chat/hugging_face_local.py Co-authored-by: Stefano Fiorucci * Update haystack/components/generators/hugging_face_api.py Co-authored-by: Stefano Fiorucci * fixing 205 issues and attending PR comments * fixing 205 issues and attending PR comments * Update haystack/components/converters/azure.py Co-authored-by: Daria Fokina * Update haystack/components/converters/azure.py Co-authored-by: Daria Fokina * Update haystack/components/extractors/named_entity_extractor.py Co-authored-by: Daria Fokina * Update haystack/components/extractors/named_entity_extractor.py Co-authored-by: Daria Fokina * Update haystack/core/component/component.py Co-authored-by: Daria Fokina * Update haystack/components/evaluators/answer_exact_match.py Co-authored-by: Daria Fokina * Update haystack/core/pipeline/template.py Co-authored-by: Daria Fokina * Update haystack/core/serialization.py Co-authored-by: Daria Fokina * Update haystack/core/serialization.py Co-authored-by: Daria Fokina * Update haystack/core/pipeline/draw.py Co-authored-by: Daria Fokina * Update haystack/components/generators/azure.py Co-authored-by: Daria Fokina * Apply suggestions from code review Co-authored-by: Daria Fokina --------- Co-authored-by: Stefano Fiorucci Co-authored-by: Daria Fokina --- .../components/builders/answer_builder.py | 4 +++- .../builders/dynamic_chat_prompt_builder.py | 12 ++++++---- .../builders/dynamic_prompt_builder.py | 22 ++++++++++------- .../components/builders/prompt_builder.py | 2 ++ haystack/components/caching/cache_checker.py | 6 ++--- .../document_language_classifier.py | 3 +++ .../components/connectors/openapi_service.py | 15 ++++++++---- haystack/components/converters/azure.py | 24 ++++++++++++++----- .../converters/openapi_functions.py | 4 ++++ haystack/components/converters/utils.py | 3 +++ .../hugging_face_api_document_embedder.py | 2 ++ .../hugging_face_api_text_embedder.py | 2 ++ .../evaluators/answer_exact_match.py | 13 ++++++---- .../components/evaluators/document_map.py | 5 +++- .../components/evaluators/document_recall.py | 2 ++ .../components/evaluators/llm_evaluator.py | 2 ++ .../components/evaluators/sas_evaluator.py | 3 +++ .../extractors/named_entity_extractor.py | 17 ++++++------- haystack/components/fetchers/link_content.py | 5 ++++ haystack/components/generators/azure.py | 7 ++++-- haystack/components/generators/chat/azure.py | 4 ++++ .../generators/chat/hugging_face_api.py | 2 ++ .../generators/chat/hugging_face_local.py | 4 ++++ .../generators/chat/hugging_face_tgi.py | 4 ++++ haystack/components/generators/chat/openai.py | 8 +++++++ .../components/generators/hugging_face_api.py | 2 ++ haystack/components/generators/openai.py | 3 +++ haystack/components/generators/utils.py | 3 +++ .../components/joiners/document_joiner.py | 1 + haystack/components/others/multiplexer.py | 4 ++++ .../preprocessors/document_cleaner.py | 13 ++++++++++ .../preprocessors/document_splitter.py | 4 ++++ .../components/preprocessors/text_cleaner.py | 10 
+++++--- .../components/rankers/lost_in_the_middle.py | 4 ++++ haystack/components/rankers/meta_field.py | 8 +++++-- .../sentence_transformers_diversity.py | 2 ++ haystack/components/readers/extractive.py | 2 ++ .../components/routers/conditional_router.py | 2 ++ .../components/routers/file_type_router.py | 4 ++++ .../components/routers/metadata_router.py | 2 ++ .../routers/text_language_router.py | 3 +++ .../routers/zero_shot_text_router.py | 8 +++++-- haystack/components/samplers/top_p.py | 2 ++ haystack/components/validators/json_schema.py | 7 ++++-- haystack/components/websearch/searchapi.py | 2 ++ haystack/components/websearch/serper_dev.py | 2 ++ .../components/writers/document_writer.py | 1 + haystack/core/component/component.py | 23 +++++++++--------- haystack/core/pipeline/draw.py | 4 ++-- haystack/core/pipeline/pipeline.py | 13 ++++++---- haystack/core/pipeline/template.py | 15 +++++++----- haystack/core/serialization.py | 5 ++-- haystack/dataclasses/sparse_embedding.py | 6 ++--- 53 files changed, 247 insertions(+), 83 deletions(-) diff --git a/haystack/components/builders/answer_builder.py b/haystack/components/builders/answer_builder.py index b28e99e94..08e420a6e 100644 --- a/haystack/components/builders/answer_builder.py +++ b/haystack/components/builders/answer_builder.py @@ -10,6 +10,7 @@ logger = logging.getLogger(__name__) class AnswerBuilder: """ Takes a query and the replies a Generator returns as input and parses them into GeneratedAnswer objects. + Optionally, it also takes Documents and metadata from the Generator as inputs to enrich the GeneratedAnswer objects. Usage example: @@ -126,9 +127,10 @@ class AnswerBuilder: def _extract_answer_string(reply: str, pattern: Optional[str] = None) -> str: """ Extract the answer string from the generator output using the specified pattern. + If no pattern is specified, the whole string is used as the answer. - :param replies: + :param reply: The output of the Generator. A string. :param pattern: The regular expression pattern to use to extract the answer text from the generator output. diff --git a/haystack/components/builders/dynamic_chat_prompt_builder.py b/haystack/components/builders/dynamic_chat_prompt_builder.py index 2ca28e68e..06fa5f612 100644 --- a/haystack/components/builders/dynamic_chat_prompt_builder.py +++ b/haystack/components/builders/dynamic_chat_prompt_builder.py @@ -11,10 +11,12 @@ logger = logging.getLogger(__name__) @component class DynamicChatPromptBuilder: """ - DynamicChatPromptBuilder is designed to construct dynamic prompts from a list of `ChatMessage` instances. It - integrates with Jinja2 templating for dynamic prompt generation. It considers any user or system message in the list - potentially containing a template and renders it with variables provided to the constructor. Additional template - variables can be feed into the component/pipeline `run` method and will be merged before rendering the template. + DynamicChatPromptBuilder is designed to construct dynamic prompts from a list of `ChatMessage` instances. + + It integrates with Jinja2 templating for dynamic prompt generation. It considers any user or system message in the + list potentially containing a template and renders it with variables provided to the constructor. Additional + template variables can be fed into the component/pipeline `run` method and will be merged before rendering the + template.
Usage example: ```python @@ -92,6 +94,7 @@ class DynamicChatPromptBuilder: def run(self, prompt_source: List[ChatMessage], template_variables: Optional[Dict[str, Any]] = None, **kwargs): """ Executes the dynamic prompt building process by processing a list of `ChatMessage` instances. + Any user message or system message is inspected for templates and rendered with the variables provided to the constructor. You can provide additional template variables directly to this method, which are then merged with the variables provided to the constructor. @@ -151,6 +154,7 @@ class DynamicChatPromptBuilder: def _validate_template(self, template_text: str, provided_variables: Set[str]): """ Checks if all the required template variables are provided to the pipeline `run` method. + If all the required template variables are provided, returns a Jinja2 template object. Otherwise, raises a ValueError. diff --git a/haystack/components/builders/dynamic_prompt_builder.py b/haystack/components/builders/dynamic_prompt_builder.py index 3580ecd8b..dc7185449 100644 --- a/haystack/components/builders/dynamic_prompt_builder.py +++ b/haystack/components/builders/dynamic_prompt_builder.py @@ -10,8 +10,10 @@ logger = logging.getLogger(__name__) @component class DynamicPromptBuilder: """ - DynamicPromptBuilder is designed to construct dynamic prompts for the pipeline. Users can change the prompt - template at runtime by providing a new template for each pipeline run invocation if needed. + DynamicPromptBuilder is designed to construct dynamic prompts for the pipeline. + + Users can change the prompt template at runtime by providing a new template for each pipeline run invocation + if needed. Usage example: ```python @@ -92,12 +94,15 @@ class DynamicPromptBuilder: def run(self, prompt_source: str, template_variables: Optional[Dict[str, Any]] = None, **kwargs): """ - Executes the dynamic prompt building process. Depending on the provided type of `prompt_source`, this method - either processes a list of `ChatMessage` instances or a string template. In the case of `ChatMessage` instances, - the last user message is treated as a template and rendered with the resolved pipeline variables and any - additional template variables provided. For a string template, it directly applies the template variables to - render the final prompt. You can provide additional template variables directly to this method, that are then - merged with the variables resolved from the pipeline runtime. + Executes the dynamic prompt building process. + + Depending on the provided type of `prompt_source`, this method either processes a list of `ChatMessage` + instances or a string template. In the case of `ChatMessage` instances, the last user message is treated as a + template and rendered with the resolved pipeline variables and any additional template variables provided. + + For a string template, it directly applies the template variables to render the final prompt. You can provide + additional template variables directly to this method, which are then merged with the variables resolved from + the pipeline runtime. :param prompt_source: A string template. @@ -127,6 +132,7 @@ class DynamicPromptBuilder: def _validate_template(self, template_text: str, provided_variables: Set[str]): """ Checks if all the required template variables are provided to the pipeline `run` method. + If all the required template variables are provided, returns a Jinja2 template object. Otherwise, raises a ValueError.
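As an aside, the `_validate_template` behavior described in the two builders above can be sketched with plain Jinja2. This is an illustrative standalone function, not the components' actual implementation; the function name and error message are hypothetical:

```python
from jinja2 import Environment, meta


def validate_template(template_text: str, provided_variables: set):
    """Return a Jinja2 template object if all required variables are provided, else raise ValueError."""
    env = Environment()
    # Collect every variable the template references but does not define itself
    required = meta.find_undeclared_variables(env.parse(template_text))
    missing = required - provided_variables
    if missing:
        raise ValueError(f"Missing template variables: {missing}")
    return env.from_string(template_text)


template = validate_template("Answer using {{ documents }}: {{ query }}", {"documents", "query"})
print(template.render(documents="<docs>", query="What is Haystack?"))
```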
diff --git a/haystack/components/builders/prompt_builder.py b/haystack/components/builders/prompt_builder.py index 64b85d76a..900463bde 100644 --- a/haystack/components/builders/prompt_builder.py +++ b/haystack/components/builders/prompt_builder.py @@ -40,6 +40,8 @@ class PromptBuilder: @component.output_types(prompt=str) def run(self, **kwargs): """ + Renders the prompt template with the provided variables. + :param kwargs: The variables that will be used to render the prompt template. diff --git a/haystack/components/caching/cache_checker.py b/haystack/components/caching/cache_checker.py index 9f7f58b73..8d88f765b 100644 --- a/haystack/components/caching/cache_checker.py +++ b/haystack/components/caching/cache_checker.py @@ -10,8 +10,7 @@ logger = logging.getLogger(__name__) @component class CacheChecker: """ - Checks for the presence of documents in a Document Store based on a specified - field in each document's metadata. + Checks for the presence of documents in a Document Store based on a specified field in each document's metadata. If matching documents are found, they are returned as hits. If not, the items are returned as misses, indicating they are not in the cache. @@ -92,8 +91,7 @@ class CacheChecker: @component.output_types(hits=List[Document], misses=List) def run(self, items: List[Any]): """ - Checks if any document associated with the specified cache field - is already present in the store. + Checks if any document associated with the specified cache field is already present in the store. :param items: Values to be checked against the cache field. diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py index 6f0108589..daa242c4c 100644 --- a/haystack/components/classifiers/document_language_classifier.py +++ b/haystack/components/classifiers/document_language_classifier.py @@ -50,6 +50,8 @@ class DocumentLanguageClassifier: def __init__(self, languages: Optional[List[str]] = None): """ + Initialize the DocumentLanguageClassifier. + :param languages: A list of languages in ISO code, each corresponding to a different output connection. For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages). If not specified, the default is ["en"]. @@ -63,6 +65,7 @@ class DocumentLanguageClassifier: def run(self, documents: List[Document]): """ This method classifies the documents' language and adds it to their metadata. + If a Document's text does not match any of the languages specified at initialization, the metadata value "unmatched" will be stored. diff --git a/haystack/components/connectors/openapi_service.py b/haystack/components/connectors/openapi_service.py index f16288b48..6fa2be055 100644 --- a/haystack/components/connectors/openapi_service.py +++ b/haystack/components/connectors/openapi_service.py @@ -16,6 +16,8 @@ with LazyImport("Run 'pip install openapi3'") as openapi_imports: @component class OpenAPIServiceConnector: """ + A component which connects the Haystack framework to OpenAPI services. + The `OpenAPIServiceConnector` component connects the Haystack framework to OpenAPI services, enabling it to call operations as defined in the OpenAPI specification of the service. @@ -77,8 +79,10 @@ class OpenAPIServiceConnector: service_credentials: Optional[Union[dict, str]] = None, ) -> Dict[str, List[ChatMessage]]: """ - Processes a list of chat messages to invoke a method on an OpenAPI service. 
It parses the last message in the - list, expecting it to contain an OpenAI function calling descriptor (name & parameters) in JSON format. + Processes a list of chat messages to invoke a method on an OpenAPI service. + + It parses the last message in the list, expecting it to contain an OpenAI function calling descriptor + (name & parameters) in JSON format. :param messages: A list of `ChatMessage` objects containing the messages to be processed. The last message should contain the function invocation payload in OpenAI function calling format. See the example in the class @@ -148,6 +152,8 @@ class OpenAPIServiceConnector: def _authenticate_service(self, openapi_service: OpenAPI, credentials: Optional[Union[dict, str]] = None): """ + Authentication with an OpenAPI service. + Authenticates with the OpenAPI service if required, supporting both single (str) and multiple authentication methods (dict). @@ -201,8 +207,9 @@ class OpenAPIServiceConnector: def _invoke_method(self, openapi_service: OpenAPI, method_invocation_descriptor: Dict[str, Any]) -> Any: """ - Invokes the specified method on the OpenAPI service. The method name and arguments are passed in the - method_invocation_descriptor. + Invokes the specified method on the OpenAPI service. + + The method name and arguments are passed in the method_invocation_descriptor. :param openapi_service: The OpenAPI service instance. :param method_invocation_descriptor: The method name and arguments to be passed to the method. The payload diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 60b41fd5b..db8003be2 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -23,7 +23,8 @@ with LazyImport(message="Run 'pip install \"azure-ai-formrecognizer>=3.2.0b2\"'" @component class AzureOCRDocumentConverter: """ - A component for converting files to Documents using Azure's Document Intelligence service. + Convert files to documents using Azure's Document Intelligence service. + Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML. In order to be able to use this component, you need an active Azure account @@ -170,6 +171,8 @@ class AzureOCRDocumentConverter: # pylint: disable=line-too-long def _convert_tables_and_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> List[Document]: """ + Converts the tables and text extracted by Azure's Document Intelligence service into Haystack Documents. + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. @@ -188,6 +191,7 @@ class AzureOCRDocumentConverter: def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> List[Document]: """ Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents. + :param result: The AnalyzeResult Azure object :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. @@ -296,8 +300,10 @@ class AzureOCRDocumentConverter: def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> Document: """ - This converts the `AnalyzeResult` object into a single Document. 
We add "\f" separators between to - differentiate between the text on separate pages. This is the expected format for the PreProcessor. + This converts the `AnalyzeResult` object into a single document. + + We add "\f" separators between to differentiate between the text on separate pages. This is the expected format + for the PreProcessor. :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). @@ -340,8 +346,10 @@ class AzureOCRDocumentConverter: self, result: "AnalyzeResult", meta: Optional[Dict[str, str]], threshold_y: float = 0.05 ) -> Document: """ - This converts the `AnalyzeResult` object into a single Haystack Document. We add "\f" separators between to - differentiate between the text on separate pages. This is the expected format for the PreProcessor. + This converts the `AnalyzeResult` object into a single Haystack Document. + + We add "\f" separators between to differentiate between the text on separate pages. This is the expected format + for the PreProcessor. :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). @@ -427,6 +435,7 @@ class AzureOCRDocumentConverter: def _collect_table_spans(self, result: "AnalyzeResult") -> Dict: """ Collect the spans of all tables by page number. + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. :returns: A dictionary with the page number as key and a list of table spans as value. """ @@ -443,6 +452,7 @@ class AzureOCRDocumentConverter: ) -> bool: """ Check if a line or paragraph is part of a table. + :param tables_on_page: A dictionary with the page number as key and a list of table spans as value. :param line_or_paragraph: The line or paragraph to check. :returns: True if the line or paragraph is part of a table, False otherwise. @@ -457,7 +467,9 @@ class AzureOCRDocumentConverter: def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str: """ - Returns a hash of the DataFrame content. The hash is based on the content of the DataFrame. + Returns a hash of the DataFrame content. + + The hash is based on the content of the DataFrame. :param df: The DataFrame to hash. :param desired_samples: The desired number of samples to hash. :param hash_length: The length of the hash for each sample. diff --git a/haystack/components/converters/openapi_functions.py b/haystack/components/converters/openapi_functions.py index 30403ace4..e9c6f5392 100644 --- a/haystack/components/converters/openapi_functions.py +++ b/haystack/components/converters/openapi_functions.py @@ -112,6 +112,8 @@ class OpenAPIServiceToFunctions: def _openapi_to_functions(self, service_openapi_spec: Dict[str, Any]) -> List[Dict[str, Any]]: """ + OpenAPI to OpenAI function conversion. + Extracts functions from the OpenAPI specification of the service and converts them into a format suitable for OpenAI function calling. @@ -188,6 +190,8 @@ class OpenAPIServiceToFunctions: self, property_schema: Dict[str, Any], include_attributes: Optional[List[str]] = None ) -> Dict[str, Any]: """ + Parses the attributes of a property schema. 
+ Recursively parses the attributes of a property schema, including nested objects and arrays, and includes specified attributes like description, pattern, etc. diff --git a/haystack/components/converters/utils.py b/haystack/components/converters/utils.py index 8666722a7..30591cd13 100644 --- a/haystack/components/converters/utils.py +++ b/haystack/components/converters/utils.py @@ -7,6 +7,7 @@ from haystack.dataclasses import ByteStream def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStream: """ Creates a ByteStream object from a source. + :param source: A source to convert to a ByteStream. Can be a string (path to a file), a Path object, or a ByteStream. :return: A ByteStream object. """ @@ -24,6 +25,8 @@ def normalize_metadata( meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int ) -> List[Dict[str, Any]]: """ + Normalize the metadata input for a converter. + Given all the possible value of the meta input for a converter (None, dictionary or list of dicts), makes sure to return a list of dictionaries of the correct length for the converter to use. diff --git a/haystack/components/embedders/hugging_face_api_document_embedder.py b/haystack/components/embedders/hugging_face_api_document_embedder.py index 3f8ebfba0..d6bcb6c52 100644 --- a/haystack/components/embedders/hugging_face_api_document_embedder.py +++ b/haystack/components/embedders/hugging_face_api_document_embedder.py @@ -19,6 +19,8 @@ logger = logging.getLogger(__name__) @component class HuggingFaceAPIDocumentEmbedder: """ + A component that embeds documents using Hugging Face APIs. + This component can be used to compute Document embeddings using different Hugging Face APIs: - [Free Serverless Inference API](https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) diff --git a/haystack/components/embedders/hugging_face_api_text_embedder.py b/haystack/components/embedders/hugging_face_api_text_embedder.py index de7c3097b..7404f2839 100644 --- a/haystack/components/embedders/hugging_face_api_text_embedder.py +++ b/haystack/components/embedders/hugging_face_api_text_embedder.py @@ -16,6 +16,8 @@ logger = logging.getLogger(__name__) @component class HuggingFaceAPITextEmbedder: """ + A component that embeds text using Hugging Face APIs. + This component can be used to embed strings using different Hugging Face APIs: - [Free Serverless Inference API](https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) diff --git a/haystack/components/evaluators/answer_exact_match.py b/haystack/components/evaluators/answer_exact_match.py index db5e72259..bbab895bd 100644 --- a/haystack/components/evaluators/answer_exact_match.py +++ b/haystack/components/evaluators/answer_exact_match.py @@ -6,11 +6,13 @@ from haystack.core.component import component @component class AnswerExactMatchEvaluator: """ - Evaluator that checks if predicted answers exactly match ground truth answers. + An answer exact match evaluator class. + + The evaluator checks if the predicted answers match any of the ground truth answers exactly. + The result is a number from 0.0 to 1.0 that represents the proportion of predicted answers + that matched one of the ground truth answers. + There can be multiple ground truth answers and multiple predicted answers as input. - Each predicted answer is compared to one ground truth answer. - The final score is a number ranging from 0.0 to 1.0.
- It represents the proportion of predicted answers that match their corresponding ground truth answer. Usage example: ```python @@ -33,7 +35,8 @@ class AnswerExactMatchEvaluator: def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]: """ Run the AnswerExactMatchEvaluator on the given inputs. - `ground_truth_answers` and `retrieved_answers` must have the same length. + + The `ground_truth_answers` and `predicted_answers` must have the same length. :param ground_truth_answers: A list of expected answers. diff --git a/haystack/components/evaluators/document_map.py b/haystack/components/evaluators/document_map.py index 303d7c4df..d87ad6124 100644 --- a/haystack/components/evaluators/document_map.py +++ b/haystack/components/evaluators/document_map.py @@ -6,6 +6,8 @@ from haystack import Document, component @component class DocumentMAPEvaluator: """ + A Mean Average Precision (MAP) evaluator for documents. + Evaluator that calculates the mean average precision of the retrieved documents, a metric that measures how high retrieved documents are ranked. Each question can have multiple ground truth documents and multiple retrieved documents. @@ -43,6 +45,7 @@ class DocumentMAPEvaluator: ) -> Dict[str, Any]: """ Run the DocumentMAPEvaluator on the given inputs. + All lists must have the same length. :param ground_truth_documents: A list of expected documents for each question. :param retrieved_documents: A list of retrieved documents for each question. :returns: A dictionary with the following outputs: - `score` - The average of calculated scores. - - `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents are ranked. + - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents are ranked. """ if len(ground_truth_documents) != len(retrieved_documents): msg = "The length of ground_truth_documents and retrieved_documents must be the same." diff --git a/haystack/components/evaluators/document_recall.py b/haystack/components/evaluators/document_recall.py index 3bd9a767b..65472bcd2 100644 --- a/haystack/components/evaluators/document_recall.py +++ b/haystack/components/evaluators/document_recall.py @@ -32,6 +32,7 @@ class RecallMode(Enum): class DocumentRecallEvaluator: """ Evaluator that calculates the Recall score for a list of documents. + Returns both a list of scores for each question and the average. There can be multiple ground truth documents and multiple predicted documents as input. @@ -91,6 +92,7 @@ class DocumentRecallEvaluator: ) -> Dict[str, Any]: """ Run the DocumentRecallEvaluator on the given inputs. + `ground_truth_documents` and `retrieved_documents` must have the same length. :param ground_truth_documents: diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 34a69e3b1..e035c4073 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -178,6 +178,8 @@ class LLMEvaluator: def prepare_template(self) -> str: """ + Prepare the prompt template.
+ Combine instructions, inputs, outputs, and examples into one prompt template with the following format: Instructions: diff --git a/haystack/components/evaluators/sas_evaluator.py b/haystack/components/evaluators/sas_evaluator.py index 6590d25d7..d7ab26a4d 100644 --- a/haystack/components/evaluators/sas_evaluator.py +++ b/haystack/components/evaluators/sas_evaluator.py @@ -16,6 +16,7 @@ with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>= class SASEvaluator: """ SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of ground truths. + It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers. The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a @@ -132,6 +133,8 @@ class SASEvaluator: @component.output_types(score=float, individual_scores=List[float]) def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]: """ + SASEvaluator component run method. + Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predicted answers and a list of ground truth answers. Both must be list of strings of same length. diff --git a/haystack/components/extractors/named_entity_extractor.py b/haystack/components/extractors/named_entity_extractor.py index 5eb78e075..1c994fc6b 100644 --- a/haystack/components/extractors/named_entity_extractor.py +++ b/haystack/components/extractors/named_entity_extractor.py @@ -159,8 +159,7 @@ class NamedEntityExtractor: @component.output_types(documents=List[Document]) def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]: """ - Annotate named entities in each document and store - the annotations in the document's metadata. + Annotate named entities in each document and store the annotations in the document's metadata. :param documents: Documents to process. @@ -227,8 +226,7 @@ class NamedEntityExtractor: @classmethod def get_stored_annotations(cls, document: Document) -> Optional[List[NamedEntityAnnotation]]: """ - Returns the document's named entity annotations stored - in its metadata, if any. + Returns the document's named entity annotations stored in its metadata, if any. :param document: Document whose annotations are to be fetched. @@ -259,16 +257,14 @@ class _NerBackend(ABC): @abstractmethod def initialize(self): """ - Initializes the backend. This would usually - entail loading models, pipelines, etc. + Initializes the backend. This would usually entail loading models, pipelines, and so on. """ @property @abstractmethod def initialized(self) -> bool: """ - Returns if the backend has been initialized, i.e, - ready to annotate text. + Returns whether the backend has been initialized, that is, ready to annotate text. """ @abstractmethod @@ -295,6 +291,8 @@ class _NerBackend(ABC): @property def device(self) -> ComponentDevice: """ + The device on which the backend's model is loaded. + :returns: The device on which the backend's model is loaded. """ @@ -457,8 +455,7 @@ class _SpacyBackend(_NerBackend): @contextmanager def _select_device(self): """ - Context manager used to run spaCy models on a specific - GPU in a scoped manner. + Context manager used to run spaCy models on a specific GPU in a scoped manner. """ # TODO: This won't restore the active device.
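A minimal usage sketch of the `run` and `get_stored_annotations` methods documented above. The constructor arguments, import path, and `warm_up` call are assumptions for illustration and are not taken from this patch:

```python
from haystack import Document
from haystack.components.extractors import NamedEntityExtractor

# backend/model values are assumed; check the component's __init__ for the exact options
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
extractor.warm_up()  # initializes the underlying NER backend

docs = [Document(content="Haystack is developed by deepset in Berlin.")]
extractor.run(documents=docs, batch_size=1)  # annotations land in each document's metadata

# Read the stored annotations back from the document's metadata
print(NamedEntityExtractor.get_stored_annotations(docs[0]))
```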
diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index 2ccf81e6b..574af5be7 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -26,6 +26,8 @@ REQUEST_HEADERS = { def _text_content_handler(response: Response) -> ByteStream: """ + Handles text content. + :param response: Response object from the request. :return: The extracted text. """ @@ -34,6 +36,8 @@ def _text_content_handler(response: Response) -> ByteStream: def _binary_content_handler(response: Response) -> ByteStream: """ + Handles binary content. + :param response: Response object from the request. :return: The extracted binary file-like object. """ @@ -211,6 +215,7 @@ class LinkContentFetcher: def _switch_user_agent(self, retry_state: RetryCallState) -> None: """ Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents. + Used by tenacity to retry the requests with a different user agent. :param retry_state: The retry state (unused, required by tenacity). diff --git a/haystack/components/generators/azure.py b/haystack/components/generators/azure.py index fbe23de00..5cd7a1430 100644 --- a/haystack/components/generators/azure.py +++ b/haystack/components/generators/azure.py @@ -14,8 +14,9 @@ logger = logging.getLogger(__name__) class AzureOpenAIGenerator(OpenAIGenerator): """ - Enables text generation using OpenAI's large language models (LLMs) on Azure. It supports gpt-4 and gpt-3.5-turbo - family of models. + A Generator component that uses OpenAI's large language models (LLMs) on Azure to generate text. + + It supports gpt-4 and gpt-3.5-turbo family of models. Users can pass any text generation parameters valid for the `openai.ChatCompletion.create` method directly to this component via the `**generation_kwargs` parameter in __init__ or the `**generation_kwargs` @@ -59,6 +60,8 @@ class AzureOpenAIGenerator(OpenAIGenerator): generation_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initialize the Azure OpenAI Generator. + :param azure_endpoint: The endpoint of the deployed model, e.g. `https://example-resource.azure.openai.com/` :param api_version: The version of the API to use. Defaults to 2023-05-15 :param azure_deployment: The deployment of the model, usually the model name. diff --git a/haystack/components/generators/chat/azure.py b/haystack/components/generators/chat/azure.py index 6a1e8fb64..e1d3029ee 100644 --- a/haystack/components/generators/chat/azure.py +++ b/haystack/components/generators/chat/azure.py @@ -14,6 +14,8 @@ logger = logging.getLogger(__name__) class AzureOpenAIChatGenerator(OpenAIChatGenerator): """ + A Chat Generator component that uses the Azure OpenAI API to generate text. + Enables text generation using OpenAI's large language models (LLMs) on Azure. It supports `gpt-4` and `gpt-3.5-turbo` family of models accessed through the chat completions API endpoint. @@ -76,6 +78,8 @@ class AzureOpenAIChatGenerator(OpenAIChatGenerator): generation_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initialize the Azure OpenAI Chat Generator component. + :param azure_endpoint: The endpoint of the deployed model, e.g. `"https://example-resource.azure.openai.com/"` :param api_version: The version of the API to use. Defaults to 2023-05-15 :param azure_deployment: The deployment of the model, usually the model name. 
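A hedged initialization sketch for `AzureOpenAIChatGenerator`, using the parameter names from the `__init__` hunk above; the endpoint and deployment values are placeholders, and the API key is assumed to be read from the environment:

```python
from haystack.components.generators.chat import AzureOpenAIChatGenerator
from haystack.dataclasses import ChatMessage

# Placeholder endpoint/deployment; the API key is typically picked up from the
# AZURE_OPENAI_API_KEY environment variable rather than passed in code.
generator = AzureOpenAIChatGenerator(
    azure_endpoint="https://example-resource.azure.openai.com/",
    api_version="2023-05-15",
    azure_deployment="gpt-35-turbo",
    generation_kwargs={"max_tokens": 256},
)
result = generator.run(messages=[ChatMessage.from_user("What's Natural Language Processing?")])
print(result["replies"][0].content)
```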
diff --git a/haystack/components/generators/chat/hugging_face_api.py b/haystack/components/generators/chat/hugging_face_api.py index 8cdb8dc66..eac3877ac 100644 --- a/haystack/components/generators/chat/hugging_face_api.py +++ b/haystack/components/generators/chat/hugging_face_api.py @@ -17,6 +17,8 @@ logger = logging.getLogger(__name__) @component class HuggingFaceAPIChatGenerator: """ + A Chat Generator component that uses Hugging Face APIs to generate text. + This component can be used to generate text using different Hugging Face APIs with the ChatMessage format: - [Free Serverless Inference API](https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py index dfdb087d1..ebb8612b8 100644 --- a/haystack/components/generators/chat/hugging_face_local.py +++ b/haystack/components/generators/chat/hugging_face_local.py @@ -32,6 +32,8 @@ PIPELINE_SUPPORTED_TASKS = ["text-generation", "text2text-generation"] @component class HuggingFaceLocalChatGenerator: """ + A Chat Generator component that uses models available on Hugging Face Hub to generate chat responses locally. + The `HuggingFaceLocalChatGenerator` class is a component designed for generating chat responses using models from Hugging Face's model hub. It is tailored for local runtime text generation tasks and provides a convenient interface for working with chat-based models, such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf` @@ -78,6 +80,8 @@ class HuggingFaceLocalChatGenerator: streaming_callback: Optional[Callable[[StreamingChunk], None]] = None, ): """ + Initializes the HuggingFaceLocalChatGenerator component. + :param model: The name or path of a Hugging Face model for text generation, for example, `mistralai/Mistral-7B-Instruct-v0.2`, `TheBloke/OpenHermes-2.5-Mistral-7B-16k-AWQ`, etc. The important aspect of the model is that it should be a chat model and that it supports ChatML messaging diff --git a/haystack/components/generators/chat/hugging_face_tgi.py b/haystack/components/generators/chat/hugging_face_tgi.py index 9d5fa752b..095646863 100644 --- a/haystack/components/generators/chat/hugging_face_tgi.py +++ b/haystack/components/generators/chat/hugging_face_tgi.py @@ -24,6 +24,8 @@ logger = logging.getLogger(__name__) @component class HuggingFaceTGIChatGenerator: """ + A Chat-based text generation component using Hugging Face's Text Generation Inference (TGI) framework. + Enables text generation using HuggingFace Hub hosted chat-based LLMs. This component is designed to seamlessly inference chat-based models deployed on the Text Generation Inference (TGI) backend. @@ -147,6 +149,8 @@ class HuggingFaceTGIChatGenerator: def warm_up(self) -> None: """ + Warm up the tokenizer by loading it from the model. + If the url is not provided, check if the model is deployed on the free tier of the HF inference API. Load the tokenizer """ diff --git a/haystack/components/generators/chat/openai.py b/haystack/components/generators/chat/openai.py index d05ed1d8b..836989219 100644 --- a/haystack/components/generators/chat/openai.py +++ b/haystack/components/generators/chat/openai.py @@ -17,6 +17,8 @@ logger = logging.getLogger(__name__) @component class OpenAIChatGenerator: """ + A Chat Generator component that uses the OpenAI API to generate text. + Enables text generation using OpenAI's large language models (LLMs). 
It supports `gpt-4` and `gpt-3.5-turbo` family of models accessed through the chat completions API endpoint. @@ -71,6 +73,8 @@ class OpenAIChatGenerator: generation_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initializes the OpenAIChatGenerator component. + Creates an instance of OpenAIChatGenerator. Unless specified otherwise in the `model`, this is for OpenAI's GPT-3.5 model. @@ -206,6 +210,7 @@ class OpenAIChatGenerator: def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessage: """ Connects the streaming chunks into a single ChatMessage. + :param chunk: The last chunk returned by the OpenAI API. :param chunks: The list of all chunks returned by the OpenAI API. """ @@ -256,6 +261,7 @@ class OpenAIChatGenerator: def _build_message(self, completion: ChatCompletion, choice: Choice) -> ChatMessage: """ Converts the non-streaming response from the OpenAI API to a ChatMessage. + :param completion: The completion returned by the OpenAI API. :param choice: The choice returned by the OpenAI API. :return: The ChatMessage. @@ -287,6 +293,7 @@ class OpenAIChatGenerator: def _build_chunk(self, chunk: ChatCompletionChunk) -> StreamingChunk: """ Converts the streaming response chunk from the OpenAI API to a StreamingChunk. + :param chunk: The chunk returned by the OpenAI API. :param choice: The choice returned by the OpenAI API. :return: The StreamingChunk. @@ -311,6 +318,7 @@ class OpenAIChatGenerator: def _check_finish_reason(self, message: ChatMessage) -> None: """ Check the `finish_reason` returned with the OpenAI completions. + If the `finish_reason` is `length` or `content_filter`, log a warning. :param message: The message returned by the LLM. """ diff --git a/haystack/components/generators/hugging_face_api.py b/haystack/components/generators/hugging_face_api.py index 803f432a6..a6d34431c 100644 --- a/haystack/components/generators/hugging_face_api.py +++ b/haystack/components/generators/hugging_face_api.py @@ -23,6 +23,8 @@ logger = logging.getLogger(__name__) @component class HuggingFaceAPIGenerator: """ + A Generator component that uses Hugging Face APIs to generate text. + This component can be used to generate text using different Hugging Face APIs: - [Free Serverless Inference API]((https://huggingface.co/inference-api) - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) diff --git a/haystack/components/generators/openai.py b/haystack/components/generators/openai.py index 966b552bd..d546fda8c 100644 --- a/haystack/components/generators/openai.py +++ b/haystack/components/generators/openai.py @@ -13,6 +13,8 @@ logger = logging.getLogger(__name__) @component class OpenAIGenerator: """ + Text generation component using OpenAI's large language models (LLMs). + Enables text generation using OpenAI's large language models (LLMs). It supports gpt-4 and gpt-3.5-turbo family of models. @@ -258,6 +260,7 @@ class OpenAIGenerator: def _check_finish_reason(self, message: ChatMessage) -> None: """ Check the `finish_reason` returned with the OpenAI completions. + If the `finish_reason` is `length`, log a warning to the user. :param message: diff --git a/haystack/components/generators/utils.py b/haystack/components/generators/utils.py index 17e225f1d..0afb06fe0 100644 --- a/haystack/components/generators/utils.py +++ b/haystack/components/generators/utils.py @@ -7,6 +7,7 @@ from haystack.utils import deserialize_callable, serialize_callable def print_streaming_chunk(chunk: StreamingChunk) -> None: """ Default callback function for streaming responses. 
+ Prints the tokens of the first completion to stdout as soon as they are received """ print(chunk.content, flush=True, end="") @@ -15,6 +16,7 @@ def print_streaming_chunk(chunk: StreamingChunk) -> None: def serialize_callback_handler(streaming_callback: Callable[[StreamingChunk], None]) -> str: """ Serializes the streaming callback handler. + :param streaming_callback: The streaming callback handler function :returns: @@ -26,6 +28,7 @@ def serialize_callback_handler(streaming_callback: Callable[[StreamingChunk], No def deserialize_callback_handler(callback_name: str) -> Optional[Callable[[StreamingChunk], None]]: """ Deserializes the streaming callback handler. + :param callback_name: The full path of the streaming callback handler function :returns: diff --git a/haystack/components/joiners/document_joiner.py b/haystack/components/joiners/document_joiner.py index ee432b9e8..154bfadb6 100644 --- a/haystack/components/joiners/document_joiner.py +++ b/haystack/components/joiners/document_joiner.py @@ -137,6 +137,7 @@ class DocumentJoiner: def _reciprocal_rank_fusion(self, document_lists): """ Merge multiple lists of Documents and assign scores based on reciprocal rank fusion. + The constant k is set to 61 (60 was suggested by the original paper, plus 1 as python lists are 0-based and the paper used 1-based ranking). """ diff --git a/haystack/components/others/multiplexer.py b/haystack/components/others/multiplexer.py index 0569349b4..9eca10cef 100644 --- a/haystack/components/others/multiplexer.py +++ b/haystack/components/others/multiplexer.py @@ -17,6 +17,8 @@ logger = logging.getLogger(__name__) @component(is_greedy=True) class Multiplexer: """ + A component which receives data connections from multiple components and distributes them to multiple components. + `Multiplexer` offers the ability to both receive data connections from multiple other components and to distribute it to various other components, enhancing the functionality of complex data processing pipelines. @@ -125,6 +127,8 @@ class Multiplexer: def run(self, **kwargs): """ + The run method of the `Multiplexer` component. + Multiplexes the input data from the upstream connected components and distributes it to the downstream connected components. diff --git a/haystack/components/preprocessors/document_cleaner.py b/haystack/components/preprocessors/document_cleaner.py index ab3cbe9c5..4d873116f 100644 --- a/haystack/components/preprocessors/document_cleaner.py +++ b/haystack/components/preprocessors/document_cleaner.py @@ -12,6 +12,8 @@ logger = logging.getLogger(__name__) @component class DocumentCleaner: """ + Cleans the text in the documents. + Cleans up text documents by removing extra whitespaces, empty lines, specified substrings, regexes, page headers and footers (in this order). @@ -38,6 +40,8 @@ class DocumentCleaner: remove_regex: Optional[str] = None, ): """ + Initialize the DocumentCleaner. + :param remove_empty_lines: Whether to remove empty lines. :param remove_extra_whitespaces: Whether to remove extra whitespaces. :param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages. @@ -97,6 +101,7 @@ class DocumentCleaner: def _remove_empty_lines(self, text: str) -> str: """ Remove empty lines and lines that contain nothing but whitespaces from text. + :param text: Text to clean. :returns: The text without empty lines. """ @@ -107,6 +112,7 @@ class DocumentCleaner: def _remove_extra_whitespaces(self, text: str) -> str: """ Remove extra whitespaces from text. 
+ :param text: Text to clean. :returns: The text without extra whitespaces. """ @@ -115,6 +121,7 @@ class DocumentCleaner: def _remove_regex(self, text: str, regex: str) -> str: """ Remove substrings that match the specified regex from the text. + :param text: Text to clean. :param regex: Regex to match and replace substrings by "". :returns: The text without the substrings that match the regex. @@ -124,6 +131,7 @@ class DocumentCleaner: def _remove_substrings(self, text: str, substrings: List[str]) -> str: """ Remove all specified substrings from the text. + :param text: Text to clean. :param substrings: Substrings to remove. :returns: The text without the specified substrings. @@ -135,6 +143,7 @@ class DocumentCleaner: def _remove_repeated_substrings(self, text: str) -> str: """ Remove any substrings from the text that occur repeatedly on every page. For example headers or footers. + Pages in the text need to be separated by form feed character "\f". :param text: Text to clean. :returns: The text without the repeated substrings. @@ -148,6 +157,7 @@ class DocumentCleaner: ) -> str: """ Heuristic to find footers and headers across different pages by searching for the longest common string. + Pages in the text need to be separated by form feed character "\f". For headers, we only search in the first n_chars characters (for footer: last n_chars). Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX", @@ -182,6 +192,7 @@ class DocumentCleaner: def _ngram(self, seq: str, n: int) -> Generator[str, None, None]: """ Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace. + :param seq: The sequence to generate ngrams from. :param n: The length of the ngrams to generate. :returns: A Generator generating all ngrams of length n from the given sequence. @@ -202,6 +213,7 @@ class DocumentCleaner: def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]: """ Generates all possible ngrams from a given sequence of text. + Considering all ngram lengths between the minimum and maximum length. :param seq: The sequence to generate ngrams from. @@ -217,6 +229,7 @@ class DocumentCleaner: def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, max_ngram: int = 30) -> str: """ Find the longest common ngram across a list of text sequences (e.g. start of pages). + Considering all ngram lengths between the minimum and maximum length. Helpful for finding footers, headers etc. Empty sequences are ignored. diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index b76bdde66..adea7cc3c 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -23,6 +23,8 @@ class DocumentSplitter: split_overlap: int = 0, ): """ + Initialize the DocumentSplitter. + :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ", "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n". :param split_length: The maximum number of units in each split. @@ -42,6 +44,8 @@ class DocumentSplitter: @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): """ + Split documents into smaller parts. + Splits documents by the unit expressed in `split_by`, with a length of `split_length` and an overlap of `split_overlap`. 
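To make the `split_by`/`split_length`/`split_overlap` semantics documented above concrete, a small usage sketch (the parameter values are arbitrary):

```python
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

# Split by words into chunks of 4, with 1 word of overlap between consecutive chunks
splitter = DocumentSplitter(split_by="word", split_length=4, split_overlap=1)
doc = Document(content="This is a longer text that will be split into overlapping chunks")
result = splitter.run(documents=[doc])
for chunk in result["documents"]:
    print(repr(chunk.content))
```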
diff --git a/haystack/components/preprocessors/text_cleaner.py b/haystack/components/preprocessors/text_cleaner.py index 43d5009e0..3155abcf4 100644 --- a/haystack/components/preprocessors/text_cleaner.py +++ b/haystack/components/preprocessors/text_cleaner.py @@ -8,10 +8,12 @@ from haystack import component @component class TextCleaner: """ - A preprocessor component to clean text data. It can remove substrings matching a list of regular expressions, - convert text to lowercase, remove punctuation, and remove numbers. + A PreProcessor component to clean text data. - This is useful to cleanup text data before evaluation. + It can remove substrings matching a list of regular expressions, convert text to lowercase, remove punctuation, + and remove numbers. + + This is useful to clean up text data before evaluation. """ def __init__( @@ -22,6 +24,8 @@ class TextCleaner: remove_numbers: bool = False, ): """ + Initialize the TextCleaner component. + :param remove_regexps: A list of regular expressions. If provided, it removes substrings matching these regular expressions from the text. :param convert_to_lowercase: If True, converts all characters to lowercase. diff --git a/haystack/components/rankers/lost_in_the_middle.py b/haystack/components/rankers/lost_in_the_middle.py index 292ec7788..1f45045a5 100644 --- a/haystack/components/rankers/lost_in_the_middle.py +++ b/haystack/components/rankers/lost_in_the_middle.py @@ -6,6 +6,8 @@ from haystack import Document, component @component class LostInTheMiddleRanker: """ + A LostInTheMiddle Ranker. + Ranks documents based on the 'lost in the middle' order so that the most relevant documents are either at the beginning or end, while the least relevant are in the middle. @@ -33,6 +35,8 @@ class LostInTheMiddleRanker: def __init__(self, word_count_threshold: Optional[int] = None, top_k: Optional[int] = None): """ + Initialize the LostInTheMiddleRanker. + If 'word_count_threshold' is specified, this ranker includes all documents up until the point where adding another document would exceed the 'word_count_threshold'. The last document that causes the threshold to be breached will be included in the resulting list of documents, but all subsequent documents will be diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index d1ba40ba6..ad2ca1da5 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -141,6 +141,7 @@ class MetaFieldRanker: ): """ Ranks a list of Documents based on the selected meta field by: + 1. Sorting the Documents by the meta field in descending or ascending order. 2. Merging the rankings from the previous component and based on the meta field according to ranking mode and weight. @@ -337,8 +338,10 @@ class MetaFieldRanker: @staticmethod def _calculate_rrf(rank: int, k: int = 61) -> float: """ - Calculates the reciprocal rank fusion. The constant K is set to 61 (60 was suggested by the original paper, - plus 1 as python lists are 0-based and the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking). + Calculates the reciprocal rank fusion. + + The constant K is set to 61 (60 was suggested by the original paper, plus 1 as python lists are 0-based and + the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking). 
""" return 1 / (k + rank) @@ -346,6 +349,7 @@ class MetaFieldRanker: def _calc_linear_score(rank: int, amount: int) -> float: """ Calculate the meta field score as a linear score between the greatest and the lowest score in the list. + This linear scaling is useful for: - Reducing the effect of outliers - Creating scores that are meaningfully distributed in the range [0,1], diff --git a/haystack/components/rankers/sentence_transformers_diversity.py b/haystack/components/rankers/sentence_transformers_diversity.py index 86915eb65..0e68be2dd 100644 --- a/haystack/components/rankers/sentence_transformers_diversity.py +++ b/haystack/components/rankers/sentence_transformers_diversity.py @@ -15,6 +15,8 @@ with LazyImport(message="Run 'pip install \"sentence-transformers>=2.2.0\"'") as @component class SentenceTransformersDiversityRanker: """ + A Diversity Ranker based on Sentence Transformers. + Implements a document ranking algorithm that orders documents in such a way as to maximize the overall diversity of the documents. diff --git a/haystack/components/readers/extractive.py b/haystack/components/readers/extractive.py index b7aacb9ad..1591bc91d 100644 --- a/haystack/components/readers/extractive.py +++ b/haystack/components/readers/extractive.py @@ -455,6 +455,8 @@ class ExtractiveReader: self, answers: List[ExtractedAnswer], overlap_threshold: Optional[float] ) -> List[ExtractedAnswer]: """ + De-duplicates overlapping Extractive Answers. + De-duplicates overlapping Extractive Answers from the same document based on how much the spans of the answers overlap. diff --git a/haystack/components/routers/conditional_router.py b/haystack/components/routers/conditional_router.py index 9760eed9a..a58ebca65 100644 --- a/haystack/components/routers/conditional_router.py +++ b/haystack/components/routers/conditional_router.py @@ -163,6 +163,8 @@ class ConditionalRouter: def run(self, **kwargs): """ + Executes the routing logic. + Executes the routing logic by evaluating the specified boolean condition expressions for each route in the order they are listed. The method directs the flow of data to the output specified in the first route whose `condition` is True. diff --git a/haystack/components/routers/file_type_router.py b/haystack/components/routers/file_type_router.py index 28bed9f38..8c08f4780 100644 --- a/haystack/components/routers/file_type_router.py +++ b/haystack/components/routers/file_type_router.py @@ -13,6 +13,8 @@ logger = logging.getLogger(__name__) @component class FileTypeRouter: """ + Groups a list of data sources by their MIME types. + FileTypeRouter groups a list of data sources (file paths or byte streams) by their MIME types, allowing for flexible routing of files to different components based on their content type. It supports both exact MIME type matching and pattern matching using regular expressions. @@ -50,6 +52,8 @@ class FileTypeRouter: def __init__(self, mime_types: List[str]): """ + Initialize the FileTypeRouter component. + :param mime_types: A list of file mime types to consider when routing files (e.g. `["text/plain", "audio/x-wav", "image/jpeg"]`). """ diff --git a/haystack/components/routers/metadata_router.py b/haystack/components/routers/metadata_router.py index e4a0bdab2..be51594fb 100644 --- a/haystack/components/routers/metadata_router.py +++ b/haystack/components/routers/metadata_router.py @@ -72,6 +72,8 @@ class MetadataRouter: def run(self, documents: List[Document]): """ + Route the documents. 
+ Route the documents to different edges based on their fields content and the rules specified during initialization. If a document does not match any of the rules, it is routed to a connection named "unmatched". diff --git a/haystack/components/routers/text_language_router.py b/haystack/components/routers/text_language_router.py index 67bcb932e..3da512686 100644 --- a/haystack/components/routers/text_language_router.py +++ b/haystack/components/routers/text_language_router.py @@ -44,6 +44,8 @@ class TextLanguageRouter: def __init__(self, languages: Optional[List[str]] = None): """ + Initialize the TextLanguageRouter component. + :param languages: A list of languages in ISO code, each corresponding to a different output connection. For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages). If not specified, the default is `["en"]`. @@ -57,6 +59,7 @@ class TextLanguageRouter: def run(self, text: str) -> Dict[str, str]: """ Route the text to one of different output connections based on its language. + If the text does not match any of the languages specified at initialization, it is routed to a connection named "unmatched". diff --git a/haystack/components/routers/zero_shot_text_router.py b/haystack/components/routers/zero_shot_text_router.py index 0f08a8fb4..40ebeecdf 100644 --- a/haystack/components/routers/zero_shot_text_router.py +++ b/haystack/components/routers/zero_shot_text_router.py @@ -21,6 +21,7 @@ with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]'") a class TransformersZeroShotTextRouter: """ Routes a text input onto different output connections depending on which label it has been categorized into. + This is useful for routing queries to different models in a pipeline depending on their categorization. The set of labels to be used for categorization can be specified. @@ -102,6 +103,8 @@ class TransformersZeroShotTextRouter: huggingface_pipeline_kwargs: Optional[Dict[str, Any]] = None, ): """ + Initializes the TransformersZeroShotTextRouter. + :param labels: The set of possible class labels to classify each sequence into. Can be a single label, a string of comma-separated labels, or a list of labels. :param multi_label: Whether or not multiple candidate labels can be true. @@ -187,8 +190,9 @@ class TransformersZeroShotTextRouter: @component.output_types(documents=Dict[str, str]) def run(self, text: str): """ - Run the TransformersZeroShotTextRouter. This method routes the text to one of the different edges based on which label - it has been categorized into. + Run the TransformersZeroShotTextRouter. + + This method routes the text to one of the different edges based on which label it has been categorized into. :param text: A str to route to one of the different edges. :returns: diff --git a/haystack/components/samplers/top_p.py b/haystack/components/samplers/top_p.py index c1cb5b8ce..5b7ce5645 100644 --- a/haystack/components/samplers/top_p.py +++ b/haystack/components/samplers/top_p.py @@ -56,6 +56,7 @@ class TopPSampler: def run(self, documents: List[Document], top_p: Optional[float] = None): """ Filters documents using top-p sampling based on their scores. + If the specified top_p results in no documents being selected (especially in cases of a low top_p value), the method returns the document with the highest similarity score. @@ -113,6 +114,7 @@ class TopPSampler: def _collect_scores(self, documents: List[Document]) -> List[float]: """ Collect the scores from the documents' metadata. 
+
         :param documents: List of Documents.
         :return: List of scores.
         """
diff --git a/haystack/components/validators/json_schema.py b/haystack/components/validators/json_schema.py
index 1fc1d06c6..231015bf4 100644
--- a/haystack/components/validators/json_schema.py
+++ b/haystack/components/validators/json_schema.py
@@ -77,6 +77,8 @@ class JsonSchemaValidator:
     def __init__(self, json_schema: Optional[Dict[str, Any]] = None, error_template: Optional[str] = None):
         """
+        Initialize the JsonSchemaValidator component.
+
         :param json_schema: A dictionary representing the [JSON schema](https://json-schema.org/) against which
             the messages' content is validated.
         :param error_template: A custom template string for formatting the error message in case of validation failure.
@@ -186,8 +188,9 @@ class JsonSchemaValidator:
     def _recursive_json_to_object(self, data: Any) -> Any:
         """
-        Recursively traverses a data structure (dictionary or list), converting any string values
-        that are valid JSON objects into dictionary objects, and returns a new data structure.
+        Recursively convert any string values that are valid JSON objects into dictionary objects.
+
+        Returns a new data structure.
 
        :param data: The data structure to be traversed.
        :return: A new data structure with JSON strings converted to dictionary objects.
diff --git a/haystack/components/websearch/searchapi.py b/haystack/components/websearch/searchapi.py
index 4f3042743..4cb03cd1b 100644
--- a/haystack/components/websearch/searchapi.py
+++ b/haystack/components/websearch/searchapi.py
@@ -41,6 +41,8 @@ class SearchApiWebSearch:
         search_params: Optional[Dict[str, Any]] = None,
     ):
         """
+        Initialize the SearchApiWebSearch component.
+
         :param api_key: API key for the SearchApi API
         :param top_k: Number of documents to return.
         :param allowed_domains: List of domains to limit the search to.
diff --git a/haystack/components/websearch/serper_dev.py b/haystack/components/websearch/serper_dev.py
index 0942625d6..7cd105fa7 100644
--- a/haystack/components/websearch/serper_dev.py
+++ b/haystack/components/websearch/serper_dev.py
@@ -44,6 +44,8 @@ class SerperDevWebSearch:
         search_params: Optional[Dict[str, Any]] = None,
     ):
         """
+        Initialize the SerperDevWebSearch component.
+
         :param api_key: API key for the Serper API.
         :param top_k: Number of documents to return.
         :param allowed_domains: List of domains to limit the search to.
diff --git a/haystack/components/writers/document_writer.py b/haystack/components/writers/document_writer.py
index fe3435257..a0ed98c3e 100644
--- a/haystack/components/writers/document_writer.py
+++ b/haystack/components/writers/document_writer.py
@@ -50,6 +50,7 @@ class DocumentWriter:
     def to_dict(self) -> Dict[str, Any]:
         """
         Serializes the component to a dictionary.
+
         :returns:
             Dictionary with serialized data.
         """
diff --git a/haystack/core/component/component.py b/haystack/core/component/component.py
index 9e527e301..80c5e2178 100644
--- a/haystack/core/component/component.py
+++ b/haystack/core/component/component.py
@@ -93,10 +93,10 @@ _COMPONENT_PRE_INIT_CALLBACK: ContextVar[Optional[Callable]] = ContextVar("compo
 @contextmanager
 def _hook_component_init(callback: Callable):
     """
-    Context manager to set a callback that will be invoked
-    before a component's constructor is called. The callback
-    receives the component class and the init parameters (as keyword
-    arguments) and can modify the init parameters in place.
+    Context manager to set a callback that will be invoked before a component's constructor is called.
+
+    The callback receives the component class and the init parameters (as keyword arguments) and can modify the init
+    parameters in place.
 
     :param callback:
         Callback function to invoke.
@@ -165,8 +165,7 @@ class ComponentMeta(type):
     def __call__(cls, *args, **kwargs):
         """
-        This method is called when clients instantiate a Component and
-        runs before __new__ and __init__.
+        This method is called when clients instantiate a Component and runs before __new__ and __init__.
         """
         # This will call __new__ then __init__, giving us back the Component instance
         pre_init_hook = _COMPONENT_PRE_INIT_CALLBACK.get()
@@ -234,6 +233,7 @@ class ComponentMeta(type):
 def _component_repr(component: Component) -> str:
     """
     All Components override their __repr__ method with this one.
+
     It prints the component name and the input/output sockets.
     """
     result = object.__repr__(component)
@@ -325,8 +325,7 @@ class _Component:
     def set_output_types(self, instance, **types):
         """
-        Method that specifies the output types when the 'run' method is not decorated
-        with 'component.output_types'.
+        Method that specifies the output types when the 'run' method is not decorated with 'component.output_types'.
 
         Use as:
 
@@ -364,6 +363,8 @@ class _Component:
         def output_types_decorator(run_method):
             """
+            Decorator that sets the output types of the decorated method.
+
             This happens at class creation time, and since we don't have the decorated
             class available here, we temporarily store the output types as an attribute of
             the decorated method. The ComponentMeta metaclass will use this data to create
@@ -390,9 +391,9 @@ class _Component:
         def copy_class_namespace(namespace):
             """
-            This is the callback that `typing.new_class` will use
-            to populate the newly created class. We just copy
-            the whole namespace from the decorated class.
+            This is the callback that `typing.new_class` will use to populate the newly created class.
+
+            Simply copy the whole namespace from the decorated class.
             """
             for key, val in dict(cls.__dict__).items():
                 # __dict__ and __weakref__ are class-bound, we should let Python recreate them.
diff --git a/haystack/core/pipeline/draw.py b/haystack/core/pipeline/draw.py
index ea920dbc1..c122857a9 100644
--- a/haystack/core/pipeline/draw.py
+++ b/haystack/core/pipeline/draw.py
@@ -102,8 +102,8 @@ def _to_mermaid_text(graph: networkx.MultiDiGraph) -> str:
     """
     Converts a Networkx graph into Mermaid syntax.
 
-    The output of this function can be used in the documentation with `mermaid` codeblocks, and it will
-    be automatically rendered.
+    The output of this function can be used in the documentation with `mermaid` codeblocks and will be
+    automatically rendered.
     """
     # Copy the graph to avoid modifying the original
     graph = _prepare_for_drawing(graph.copy())
diff --git a/haystack/core/pipeline/pipeline.py b/haystack/core/pipeline/pipeline.py
index ce7cd5616..cae99da2f 100644
--- a/haystack/core/pipeline/pipeline.py
+++ b/haystack/core/pipeline/pipeline.py
@@ -241,10 +241,11 @@ class Pipeline:
         callbacks: Optional[DeserializationCallbacks] = None,
     ) -> "Pipeline":
         """
-        Creates a `Pipeline` object from the string representation read from the file-like object passed in the `fp` argument.
+        Creates a `Pipeline` object from a string representation.
+
+        The string representation is read from the file-like object passed in the `fp` argument.
+
 
-        :param data:
-            The string representation of the pipeline, can be `str`, `bytes` or `bytearray`.
         :param fp:
             A file-like object ready to be read from.
         :param marshaller:
@@ -312,7 +313,7 @@
         Connects two components together.
 
         All components to connect must exist in the pipeline.
-        If connecting to an component that has several output connections, specify the inputs and output names as
+        If connecting to a component that has several output connections, specify the inputs and output names as
         'component_name.connections_name'.
 
         :param sender:
@@ -598,6 +599,8 @@ class Pipeline:
     def _validate_input(self, data: Dict[str, Any]):
         """
+        Validates pipeline input data.
+
         Validates that data:
         * Each Component name actually exists in the Pipeline
         * Each Component is not missing any input
@@ -1047,6 +1050,8 @@ class Pipeline:
     def _prepare_component_input_data(self, data: Dict[str, Any]) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]:
         """
+        Prepares input data for pipeline components.
+
         Organizes input data for pipeline components and identifies any inputs that are not matched to any
         component's input slots.
diff --git a/haystack/core/pipeline/template.py b/haystack/core/pipeline/template.py
index 49141eb72..f5d71f937 100644
--- a/haystack/core/pipeline/template.py
+++ b/haystack/core/pipeline/template.py
@@ -22,9 +22,10 @@ class PredefinedPipeline(Enum):
 class PipelineTemplate:
     """
-    The PipelineTemplate class enables the straightforward creation of flexible and configurable pipelines using Jinja2 templated YAML files.
+    The PipelineTemplate enables the creation of flexible and configurable pipelines.
 
-    Specifically designed to simplify the setup of complex data processing pipelines for
+    The PipelineTemplate class enables the straightforward creation of flexible and configurable pipelines using
+    Jinja2 templated YAML files. Specifically designed to simplify the setup of complex data processing pipelines for
     a range of NLP tasks—including question answering, retrieval-augmented generation (RAG), document indexing, among
     others - PipelineTemplate empowers users to dynamically generate pipeline configurations from templates and
     customize components as necessary. Its design philosophy centers on providing an accessible, yet powerful, tool
@@ -63,9 +64,9 @@ class PipelineTemplate:
         """
         Initialize a PipelineTemplate.
 
-        Besides calling the constructor directly, a set of utility methods is provided
-        for conveniently create an instance of `PipelineTemplate` from different sources. See `from_string`,
-        `from_file`, `from_predefined` and `from_url`.
+        Besides calling the constructor directly, a set of utility methods is provided to conveniently create an
+        instance of `PipelineTemplate` from different sources. See `from_string`, `from_file`, `from_predefined`
+        and `from_url`.
 
         :param template_content: The raw template source to use in the template.
         """
@@ -106,7 +107,9 @@ class PipelineTemplate:
     @classmethod
     def from_predefined(cls, predefined_pipeline: PredefinedPipeline) -> "PipelineTemplate":
         """
-        Create a PipelineTemplate from a predefined template. See `PredefinedPipeline` for available options.
+        Create a PipelineTemplate from a predefined template.
+
+        See `PredefinedPipeline` for available options.
 
         :param predefined_pipeline: The predefined pipeline to use.
         :returns: An instance of `PipelineTemplate`.
diff --git a/haystack/core/serialization.py b/haystack/core/serialization.py
index df521564e..52e9efa79 100644
--- a/haystack/core/serialization.py
+++ b/haystack/core/serialization.py
@@ -123,8 +123,7 @@ def default_to_dict(obj: Any, **init_parameters) -> Dict[str, Any]:
     """
     Utility function to serialize an object to a dictionary.
-    This is mostly necessary for Components, but it can be used by any object.
-
+    This is mostly necessary for components but can be used by any object.
     `init_parameters` are parameters passed to the object class `__init__`.
     They must be defined explicitly as they'll be used when creating a new instance of `obj` with `from_dict`.
     Omitting them might cause deserialisation
@@ -165,7 +164,7 @@ def default_from_dict(cls: Type[object], data: Dict[str, Any]) -> Any:
     """
     Utility function to deserialize a dictionary to an object.
 
-    This is mostly necessary for Components but, it can be used by any object.
+    This is mostly necessary for components but can be used by any object.
 
     The function will raise a `DeserializationError` if the `type` field in `data` is missing or it doesn't match
     the type of `cls`.
diff --git a/haystack/dataclasses/sparse_embedding.py b/haystack/dataclasses/sparse_embedding.py
index 191f98dbc..5fbfc2bbd 100644
--- a/haystack/dataclasses/sparse_embedding.py
+++ b/haystack/dataclasses/sparse_embedding.py
@@ -8,7 +8,7 @@ class SparseEmbedding:
     def __init__(self, indices: List[int], values: List[float]):
         """
-        Initialize a sparse embedding.
+        Initialize a SparseEmbedding object.
 
         :param indices: List of indices of non-zero elements in the embedding.
         :param values: List of values of non-zero elements in the embedding.
@@ -22,7 +22,7 @@ class SparseEmbedding:
     def to_dict(self):
         """
-        Convert the sparse embedding to a dictionary.
+        Convert the SparseEmbedding object to a dictionary.
 
         :returns:
             Serialized sparse embedding.
@@ -32,7 +32,7 @@ class SparseEmbedding:
     @classmethod
     def from_dict(cls, sparse_embedding_dict):
         """
-        Deserializes the sparse embedding from a dictionary.
+        Deserializes the sparse embedding from a dictionary.
 
         :param sparse_embedding_dict: Dictionary to deserialize from.
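For readers who want to see the contract these `SparseEmbedding` docstrings describe in one place, below is a minimal, self-contained Python sketch of the serialization round-trip. The field names (`indices`, `values`) and the dictionary shape are taken from the docstrings above; the length-mismatch check is an assumption added for illustration, not necessarily the library's exact behavior.

from typing import Any, Dict, List


class SparseEmbedding:
    """Sketch: a sparse vector stored as parallel lists of non-zero indices and values."""

    def __init__(self, indices: List[int], values: List[float]):
        # Assumption for illustration: the two lists must line up one-to-one.
        if len(indices) != len(values):
            raise ValueError("indices and values must have the same length")
        self.indices = indices
        self.values = values

    def to_dict(self) -> Dict[str, Any]:
        # Serialized sparse embedding, as described in the to_dict docstring above.
        return {"indices": self.indices, "values": self.values}

    @classmethod
    def from_dict(cls, sparse_embedding_dict: Dict[str, Any]) -> "SparseEmbedding":
        # Inverse of to_dict: rebuild the object from its dictionary form.
        return cls(indices=sparse_embedding_dict["indices"], values=sparse_embedding_dict["values"])


# Round-trip check: serialize, deserialize, and compare the dictionary forms.
emb = SparseEmbedding(indices=[0, 7, 42], values=[0.5, 0.1, 0.9])
assert SparseEmbedding.from_dict(emb.to_dict()).to_dict() == emb.to_dict()

Keeping `to_dict` and `from_dict` symmetric like this is what makes the `default_to_dict`/`default_from_dict` style of serialization documented earlier in the patch reliable: anything emitted by one side can be consumed unchanged by the other.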