Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-12 15:27:06 +00:00
docs: fixing all D205 docstring issues (#7577)
* fixing all D205 issues
* Update haystack/components/embedders/hugging_face_api_document_embedder.py
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* Update haystack/components/embedders/hugging_face_api_text_embedder.py
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* Update haystack/components/generators/chat/hugging_face_api.py
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* Update haystack/components/generators/chat/hugging_face_local.py
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* Update haystack/components/generators/hugging_face_api.py
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* fixing 205 issues and attending PR comments
* fixing 205 issues and attending PR comments
* Update haystack/components/converters/azure.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/components/converters/azure.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/components/extractors/named_entity_extractor.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/components/extractors/named_entity_extractor.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/core/component/component.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/components/evaluators/answer_exact_match.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/core/pipeline/template.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/core/serialization.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/core/serialization.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/core/pipeline/draw.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Update haystack/components/generators/azure.py
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
* Apply suggestions from code review
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
---------
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent 081757c6b9
commit 201db5b288
@@ -10,6 +10,7 @@ logger = logging.getLogger(__name__)
class AnswerBuilder:
"""
Takes a query and the replies a Generator returns as input and parses them into GeneratedAnswer objects.

Optionally, it also takes Documents and metadata from the Generator as inputs to enrich the GeneratedAnswer objects.

Usage example:
@@ -126,9 +127,10 @@ class AnswerBuilder:
def _extract_answer_string(reply: str, pattern: Optional[str] = None) -> str:
"""
Extract the answer string from the generator output using the specified pattern.

If no pattern is specified, the whole string is used as the answer.

:param replies:
:param reply:
The output of the Generator. A string.
:param pattern:
The regular expression pattern to use to extract the answer text from the generator output.
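To make the behaviour described in this docstring concrete, here is a minimal sketch of pattern-based answer extraction (the helper name and the empty-string fallback for a non-matching pattern are illustrative assumptions, not necessarily what `AnswerBuilder` does):

```python
import re
from typing import Optional


def extract_answer_string(reply: str, pattern: Optional[str] = None) -> str:
    """Return the text captured by `pattern`, or the whole reply if no pattern is given."""
    if pattern is None:
        return reply
    match = re.search(pattern, reply)
    if match is None:
        # Assumed fallback: no match means no extractable answer.
        return ""
    # Prefer the first capturing group if the pattern defines one, otherwise the whole match.
    return match.group(1) if match.groups() else match.group(0)


print(extract_answer_string("Answer: Paris", r"Answer: (.*)"))  # -> "Paris"
```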
@@ -11,10 +11,12 @@ logger = logging.getLogger(__name__)
@component
class DynamicChatPromptBuilder:
"""
DynamicChatPromptBuilder is designed to construct dynamic prompts from a list of `ChatMessage` instances. It
integrates with Jinja2 templating for dynamic prompt generation. It considers any user or system message in the list
potentially containing a template and renders it with variables provided to the constructor. Additional template
variables can be feed into the component/pipeline `run` method and will be merged before rendering the template.
DynamicChatPromptBuilder is designed to construct dynamic prompts from a list of `ChatMessage` instances.

It integrates with Jinja2 templating for dynamic prompt generation. It considers any user or system message in the
list potentially containing a template and renders it with variables provided to the constructor. Additional
template variables can be feed into the component/pipeline `run` method and will be merged before rendering the
template.

Usage example:
```python
@@ -92,6 +94,7 @@ class DynamicChatPromptBuilder:
def run(self, prompt_source: List[ChatMessage], template_variables: Optional[Dict[str, Any]] = None, **kwargs):
"""
Executes the dynamic prompt building process by processing a list of `ChatMessage` instances.

Any user message or system message is inspected for templates and rendered with the variables provided to the
constructor. You can provide additional template variables directly to this method, which are then merged with
the variables provided to the constructor.
@@ -151,6 +154,7 @@ class DynamicChatPromptBuilder:
def _validate_template(self, template_text: str, provided_variables: Set[str]):
"""
Checks if all the required template variables are provided to the pipeline `run` method.

If all the required template variables are provided, returns a Jinja2 template object.
Otherwise, raises a ValueError.
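As a rough illustration of the validation step described above, the variables a Jinja2 template references can be compared against the provided ones (a sketch using the public `jinja2.meta` API, not the component's exact code):

```python
from jinja2 import Environment, meta


def validate_and_compile(template_text: str, provided_variables: set):
    """Return a compiled Jinja2 template, or raise ValueError if a referenced variable is missing."""
    env = Environment()
    required = meta.find_undeclared_variables(env.parse(template_text))
    missing = required - provided_variables
    if missing:
        raise ValueError(f"Missing template variables: {sorted(missing)}")
    return env.from_string(template_text)


template = validate_and_compile("Answer {{ query }} using {{ docs }}", {"query", "docs"})
print(template.render(query="What is RAG?", docs="..."))
```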
@@ -10,8 +10,10 @@ logger = logging.getLogger(__name__)
@component
class DynamicPromptBuilder:
"""
DynamicPromptBuilder is designed to construct dynamic prompts for the pipeline. Users can change the prompt
template at runtime by providing a new template for each pipeline run invocation if needed.
DynamicPromptBuilder is designed to construct dynamic prompts for the pipeline.

Users can change the prompt template at runtime by providing a new template for each pipeline run invocation
if needed.

Usage example:
```python
@@ -92,12 +94,15 @@ class DynamicPromptBuilder:

def run(self, prompt_source: str, template_variables: Optional[Dict[str, Any]] = None, **kwargs):
"""
Executes the dynamic prompt building process. Depending on the provided type of `prompt_source`, this method
either processes a list of `ChatMessage` instances or a string template. In the case of `ChatMessage` instances,
the last user message is treated as a template and rendered with the resolved pipeline variables and any
additional template variables provided. For a string template, it directly applies the template variables to
render the final prompt. You can provide additional template variables directly to this method, that are then
merged with the variables resolved from the pipeline runtime.
Executes the dynamic prompt building process.

Depending on the provided type of `prompt_source`, this method either processes a list of `ChatMessage`
instances or a string template. In the case of `ChatMessage` instances, the last user message is treated as a
template and rendered with the resolved pipeline variables and any additional template variables provided.

For a string template, it directly applies the template variables to render the final prompt. You can provide
additional template variables directly to this method, that are then merged with the variables resolved from
the pipeline runtime.

:param prompt_source:
A string template.
@@ -127,6 +132,7 @@ class DynamicPromptBuilder:
def _validate_template(self, template_text: str, provided_variables: Set[str]):
"""
Checks if all the required template variables are provided to the pipeline `run` method.

If all the required template variables are provided, returns a Jinja2 template object.
Otherwise, raises a ValueError.
@@ -40,6 +40,8 @@ class PromptBuilder:
@component.output_types(prompt=str)
def run(self, **kwargs):
"""
Renders the prompt template with the provided variables.

:param kwargs:
The variables that will be used to render the prompt template.
@@ -10,8 +10,7 @@ logger = logging.getLogger(__name__)
@component
class CacheChecker:
"""
Checks for the presence of documents in a Document Store based on a specified
field in each document's metadata.
Checks for the presence of documents in a Document Store based on a specified field in each document's metadata.

If matching documents are found, they are returned as hits. If not, the items
are returned as misses, indicating they are not in the cache.
@@ -92,8 +91,7 @@ class CacheChecker:
@component.output_types(hits=List[Document], misses=List)
def run(self, items: List[Any]):
"""
Checks if any document associated with the specified cache field
is already present in the store.
Checks if any document associated with the specified cache field is already present in the store.

:param items:
Values to be checked against the cache field.
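The hit/miss split described above can be pictured with plain dictionaries (a simplified sketch; the real component queries a Document Store with a filter on the configured cache field):

```python
from typing import Any, Dict, List, Tuple


def split_hits_and_misses(
    items: List[Any], documents: List[Dict[str, Any]], cache_field: str
) -> Tuple[List[Dict[str, Any]], List[Any]]:
    """Return documents whose cache field matches an item, and the items with no matching document."""
    cached_values = {doc.get("meta", {}).get(cache_field) for doc in documents}
    hits = [doc for doc in documents if doc.get("meta", {}).get(cache_field) in items]
    misses = [item for item in items if item not in cached_values]
    return hits, misses
```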
@@ -50,6 +50,8 @@ class DocumentLanguageClassifier:

def __init__(self, languages: Optional[List[str]] = None):
"""
Initialize the DocumentLanguageClassifier.

:param languages: A list of languages in ISO code, each corresponding to a different output connection.
For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages).
If not specified, the default is ["en"].
@@ -63,6 +65,7 @@ class DocumentLanguageClassifier:
def run(self, documents: List[Document]):
"""
This method classifies the documents' language and adds it to their metadata.

If a Document's text does not match any of the languages specified at initialization,
the metadata value "unmatched" will be stored.
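A bare-bones version of the classification rule described above, using the `langdetect` package the docstring links to (a sketch; the component itself works on `Document` objects and writes the result into their metadata):

```python
import langdetect


def classify_language(text: str, languages=("en",)) -> str:
    """Return the detected ISO code if it is one of `languages`, otherwise "unmatched"."""
    try:
        detected = langdetect.detect(text)
    except langdetect.LangDetectException:
        # Detection can fail on very short or non-linguistic text.
        return "unmatched"
    return detected if detected in languages else "unmatched"
```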
@@ -16,6 +16,8 @@ with LazyImport("Run 'pip install openapi3'") as openapi_imports:
@component
class OpenAPIServiceConnector:
"""
A component which connects the Haystack framework to OpenAPI services.

The `OpenAPIServiceConnector` component connects the Haystack framework to OpenAPI services, enabling it to call
operations as defined in the OpenAPI specification of the service.
@@ -77,8 +79,10 @@ class OpenAPIServiceConnector:
service_credentials: Optional[Union[dict, str]] = None,
) -> Dict[str, List[ChatMessage]]:
"""
Processes a list of chat messages to invoke a method on an OpenAPI service. It parses the last message in the
list, expecting it to contain an OpenAI function calling descriptor (name & parameters) in JSON format.
Processes a list of chat messages to invoke a method on an OpenAPI service.

It parses the last message in the list, expecting it to contain an OpenAI function calling descriptor
(name & parameters) in JSON format.

:param messages: A list of `ChatMessage` objects containing the messages to be processed. The last message
should contain the function invocation payload in OpenAI function calling format. See the example in the class
@@ -148,6 +152,8 @@ class OpenAPIServiceConnector:

def _authenticate_service(self, openapi_service: OpenAPI, credentials: Optional[Union[dict, str]] = None):
"""
Authentication with an OpenAPI service.

Authenticates with the OpenAPI service if required, supporting both single (str) and multiple
authentication methods (dict).
@@ -201,8 +207,9 @@ class OpenAPIServiceConnector:

def _invoke_method(self, openapi_service: OpenAPI, method_invocation_descriptor: Dict[str, Any]) -> Any:
"""
Invokes the specified method on the OpenAPI service. The method name and arguments are passed in the
method_invocation_descriptor.
Invokes the specified method on the OpenAPI service.

The method name and arguments are passed in the method_invocation_descriptor.

:param openapi_service: The OpenAPI service instance.
:param method_invocation_descriptor: The method name and arguments to be passed to the method. The payload
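The function-calling payload the connector expects in the last message can be unpacked roughly like this (a sketch; the key names follow the OpenAI function-calling convention, and the handling of string-encoded `arguments` is an assumption):

```python
import json
from typing import Any, Dict, Tuple


def parse_function_call(last_message_content: str) -> Tuple[str, Dict[str, Any]]:
    """Extract the operation name and its arguments from an OpenAI-style function-calling payload."""
    payload = json.loads(last_message_content)
    name = payload["name"]
    arguments = payload.get("arguments", {})
    if isinstance(arguments, str):
        # Models often return the arguments as a JSON-encoded string.
        arguments = json.loads(arguments)
    return name, arguments
```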
@@ -23,7 +23,8 @@ with LazyImport(message="Run 'pip install \"azure-ai-formrecognizer>=3.2.0b2\"'"
@component
class AzureOCRDocumentConverter:
"""
A component for converting files to Documents using Azure's Document Intelligence service.
Convert files to documents using Azure's Document Intelligence service.

Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.

In order to be able to use this component, you need an active Azure account
@@ -170,6 +171,8 @@ class AzureOCRDocumentConverter:
# pylint: disable=line-too-long
def _convert_tables_and_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> List[Document]:
"""
Converts the tables and text extracted by Azure's Document Intelligence service into Haystack Documents.

:param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
:param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
@@ -188,6 +191,7 @@ class AzureOCRDocumentConverter:
def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> List[Document]:
"""
Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents.

:param result: The AnalyzeResult Azure object
:param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
@@ -296,8 +300,10 @@ class AzureOCRDocumentConverter:

def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> Document:
"""
This converts the `AnalyzeResult` object into a single Document. We add "\f" separators between to
differentiate between the text on separate pages. This is the expected format for the PreProcessor.
This converts the `AnalyzeResult` object into a single document.

We add "\f" separators between to differentiate between the text on separate pages. This is the expected format
for the PreProcessor.

:param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
@@ -340,8 +346,10 @@ class AzureOCRDocumentConverter:
self, result: "AnalyzeResult", meta: Optional[Dict[str, str]], threshold_y: float = 0.05
) -> Document:
"""
This converts the `AnalyzeResult` object into a single Haystack Document. We add "\f" separators between to
differentiate between the text on separate pages. This is the expected format for the PreProcessor.
This converts the `AnalyzeResult` object into a single Haystack Document.

We add "\f" separators between to differentiate between the text on separate pages. This is the expected format
for the PreProcessor.

:param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result
can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult).
@@ -427,6 +435,7 @@ class AzureOCRDocumentConverter:
def _collect_table_spans(self, result: "AnalyzeResult") -> Dict:
"""
Collect the spans of all tables by page number.

:param result: The AnalyzeResult object returned by the `begin_analyze_document` method.
:returns: A dictionary with the page number as key and a list of table spans as value.
"""
@@ -443,6 +452,7 @@ class AzureOCRDocumentConverter:
) -> bool:
"""
Check if a line or paragraph is part of a table.

:param tables_on_page: A dictionary with the page number as key and a list of table spans as value.
:param line_or_paragraph: The line or paragraph to check.
:returns: True if the line or paragraph is part of a table, False otherwise.
@@ -457,7 +467,9 @@ class AzureOCRDocumentConverter:

def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str:
"""
Returns a hash of the DataFrame content. The hash is based on the content of the DataFrame.
Returns a hash of the DataFrame content.

The hash is based on the content of the DataFrame.
:param df: The DataFrame to hash.
:param desired_samples: The desired number of samples to hash.
:param hash_length: The length of the hash for each sample.
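One possible reading of the `_hash_dataframe` contract (sample a few rows, hash each, concatenate short digests) is sketched below; the sampling scheme and digest choice are assumptions, not the component's actual algorithm:

```python
import hashlib

import pandas as pd


def hash_dataframe(df: pd.DataFrame, desired_samples: int = 5, hash_length: int = 4) -> str:
    """Build a short content-based fingerprint from a few evenly spaced rows of the DataFrame."""
    if len(df) == 0:
        return ""
    step = max(len(df) // desired_samples, 1)
    digests = []
    for i in range(0, len(df), step):
        row_bytes = df.iloc[i].to_string().encode("utf-8")
        digests.append(hashlib.md5(row_bytes).hexdigest()[:hash_length])
    return "".join(digests[:desired_samples])
```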
@@ -112,6 +112,8 @@ class OpenAPIServiceToFunctions:

def _openapi_to_functions(self, service_openapi_spec: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
OpenAPI to OpenAI function conversion.

Extracts functions from the OpenAPI specification of the service and converts them into a format
suitable for OpenAI function calling.
@@ -188,6 +190,8 @@ class OpenAPIServiceToFunctions:
self, property_schema: Dict[str, Any], include_attributes: Optional[List[str]] = None
) -> Dict[str, Any]:
"""
Parses the attributes of a property schema.

Recursively parses the attributes of a property schema, including nested objects and arrays,
and includes specified attributes like description, pattern, etc.
@@ -7,6 +7,7 @@ from haystack.dataclasses import ByteStream
def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStream:
"""
Creates a ByteStream object from a source.

:param source: A source to convert to a ByteStream. Can be a string (path to a file), a Path object, or a ByteStream.
:return: A ByteStream object.
"""
@@ -24,6 +25,8 @@ def normalize_metadata(
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
) -> List[Dict[str, Any]]:
"""
Normalize the metadata input for a converter.

Given all the possible value of the meta input for a converter (None, dictionary or list of dicts),
makes sure to return a list of dictionaries of the correct length for the converter to use.
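The normalization rule described for `normalize_metadata` (None, a single dict, or a list of dicts, always returning one dict per source) can be sketched as follows; this mirrors the documented behaviour only, and error messages or edge cases in the real helper may differ:

```python
from typing import Any, Dict, List, Optional, Union


def normalize_metadata(
    meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], sources_count: int
) -> List[Dict[str, Any]]:
    """Return exactly one metadata dict per source, whatever form `meta` was given in."""
    if meta is None:
        return [{} for _ in range(sources_count)]
    if isinstance(meta, dict):
        # A single dict applies to every source.
        return [meta for _ in range(sources_count)]
    if len(meta) != sources_count:
        raise ValueError("The length of the metadata list must match the number of sources.")
    return meta
```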
@@ -19,6 +19,8 @@ logger = logging.getLogger(__name__)
@component
class HuggingFaceAPIDocumentEmbedder:
"""
A component that embeds documents using Hugging Face APIs.

This component can be used to compute Document embeddings using different Hugging Face APIs:
- [Free Serverless Inference API]((https://huggingface.co/inference-api)
- [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
@@ -16,6 +16,8 @@ logger = logging.getLogger(__name__)
@component
class HuggingFaceAPITextEmbedder:
"""
A component that embeds text using Hugging Face APIs.

This component can be used to embed strings using different Hugging Face APIs:
- [Free Serverless Inference API]((https://huggingface.co/inference-api)
- [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
@@ -6,11 +6,13 @@ from haystack.core.component import component
@component
class AnswerExactMatchEvaluator:
"""
Evaluator that checks if predicted answers exactly match ground truth answers.
An answer exact match evaluator class.

The evaluator that checks if the predicted answers matches any of the ground truth answers exactly.
The result is a number from 0.0 to 1.0, it represents the proportion of predicted answers
that matched one of the ground truth answers.
There can be multiple ground truth answers and multiple predicted answers as input.

Each predicted answer is compared to one ground truth answer.
The final score is a number ranging from 0.0 to 1.0.
It represents the proportion of predicted answers that match their corresponding ground truth answer.

Usage example:
```python
@@ -33,7 +35,8 @@ class AnswerExactMatchEvaluator:
def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
"""
Run the AnswerExactMatchEvaluator on the given inputs.
`ground_truth_answers` and `retrieved_answers` must have the same length.

The `ground_truth_answers` and `retrieved_answers` must have the same length.

:param ground_truth_answers:
A list of expected answers.
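The scoring rule in the updated docstring (pairwise comparison, proportion of exact matches) amounts to the following; this is a sketch of the metric itself, not the evaluator's exact return format, which wraps the result in a dictionary:

```python
from typing import List


def exact_match_score(ground_truth_answers: List[str], predicted_answers: List[str]) -> float:
    """Proportion of predicted answers that exactly match their corresponding ground-truth answer."""
    if len(ground_truth_answers) != len(predicted_answers):
        raise ValueError("ground_truth_answers and predicted_answers must have the same length.")
    if not predicted_answers:
        return 0.0
    matches = sum(gt == pred for gt, pred in zip(ground_truth_answers, predicted_answers))
    return matches / len(predicted_answers)
```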
@@ -6,6 +6,8 @@ from haystack import Document, component
@component
class DocumentMAPEvaluator:
"""
A Mean Average Precision (MAP) evaluator for documents.

Evaluator that calculates the mean average precision of the retrieved documents, a metric
that measures how high retrieved documents are ranked.
Each question can have multiple ground truth documents and multiple retrieved documents.
@@ -43,6 +45,7 @@ class DocumentMAPEvaluator:
) -> Dict[str, Any]:
"""
Run the DocumentMAPEvaluator on the given inputs.

All lists must have the same length.

:param ground_truth_documents:
@@ -52,7 +55,7 @@ class DocumentMAPEvaluator:
:returns:
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `invididual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents are ranked.
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents are ranked.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
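For one query, the average-precision number behind this evaluator can be computed as below; note that conventions differ on whether to divide by the number of relevant documents retrieved or by all ground-truth documents, so treat this as an illustrative sketch rather than the evaluator's exact formula:

```python
from typing import List


def average_precision(ground_truth_ids: List[str], retrieved_ids: List[str]) -> float:
    """Mean of precision@k over the ranks k at which a ground-truth document appears."""
    precisions = []
    hits = 0
    for rank, doc_id in enumerate(retrieved_ids, start=1):
        if doc_id in ground_truth_ids:
            hits += 1
            precisions.append(hits / rank)
    return sum(precisions) / len(precisions) if precisions else 0.0
```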
@@ -32,6 +32,7 @@ class RecallMode(Enum):
class DocumentRecallEvaluator:
"""
Evaluator that calculates the Recall score for a list of documents.

Returns both a list of scores for each question and the average.
There can be multiple ground truth documents and multiple predicted documents as input.
@@ -91,6 +92,7 @@ class DocumentRecallEvaluator:
) -> Dict[str, Any]:
"""
Run the DocumentRecallEvaluator on the given inputs.

`ground_truth_documents` and `retrieved_documents` must have the same length.

:param ground_truth_documents:
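The recall notion described here, in its multi-hit form (every ground-truth document counts, wherever it appears in the retrieved list), reduces to the sketch below; the single-hit mode of `RecallMode` is not shown:

```python
from typing import List


def recall_multi_hit(ground_truth_ids: List[str], retrieved_ids: List[str]) -> float:
    """Fraction of distinct ground-truth documents found anywhere in the retrieved list."""
    unique_ground_truth = set(ground_truth_ids)
    if not unique_ground_truth:
        return 0.0
    found = sum(1 for doc_id in unique_ground_truth if doc_id in retrieved_ids)
    return found / len(unique_ground_truth)
```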
@@ -178,6 +178,8 @@ class LLMEvaluator:

def prepare_template(self) -> str:
"""
Prepare the prompt template.

Combine instructions, inputs, outputs, and examples into one prompt template with the following format:
Instructions:
<instructions>
@@ -16,6 +16,7 @@ with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>=
class SASEvaluator:
"""
SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of ground truths.

It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated answers.

The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
@@ -132,6 +133,8 @@ class SASEvaluator:
@component.output_types(score=float, individual_scores=List[float])
def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
"""
SASEvaluator component run method.

Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predicted answers
and a list of ground truth answers. Both must be list of strings of same length.
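The pairwise similarity this docstring describes can be approximated with the `sentence-transformers` package mentioned in the LazyImport above (a sketch; the model name is just a common choice, and the real evaluator also supports cross-encoder models and returns its results as a dictionary):

```python
from sentence_transformers import SentenceTransformer, util


def sas_scores(ground_truth_answers, predicted_answers, model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"):
    """Cosine similarity between each prediction and its ground truth, plus the average."""
    model = SentenceTransformer(model_name)
    gt_emb = model.encode(ground_truth_answers, convert_to_tensor=True)
    pred_emb = model.encode(predicted_answers, convert_to_tensor=True)
    individual = [float(util.cos_sim(g, p)) for g, p in zip(gt_emb, pred_emb)]
    return sum(individual) / len(individual), individual
```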
@ -159,8 +159,7 @@ class NamedEntityExtractor:
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]:
|
||||
"""
|
||||
Annotate named entities in each document and store
|
||||
the annotations in the document's metadata.
|
||||
Annotate named entities in each document and store the annotations in the document's metadata.
|
||||
|
||||
:param documents:
|
||||
Documents to process.
|
||||
@ -227,8 +226,7 @@ class NamedEntityExtractor:
|
||||
@classmethod
|
||||
def get_stored_annotations(cls, document: Document) -> Optional[List[NamedEntityAnnotation]]:
|
||||
"""
|
||||
Returns the document's named entity annotations stored
|
||||
in its metadata, if any.
|
||||
Returns the document's named entity annotations stored in its metadata, if any.
|
||||
|
||||
:param document:
|
||||
Document whose annotations are to be fetched.
|
||||
@ -259,16 +257,14 @@ class _NerBackend(ABC):
|
||||
@abstractmethod
|
||||
def initialize(self):
|
||||
"""
|
||||
Initializes the backend. This would usually
|
||||
entail loading models, pipelines, etc.
|
||||
Initializes the backend. This would usually entail loading models, pipelines, and so on.
|
||||
"""
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def initialized(self) -> bool:
|
||||
"""
|
||||
Returns if the backend has been initialized, i.e,
|
||||
ready to annotate text.
|
||||
Returns if the backend has been initialized, for example, ready to annotate text.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
@ -295,6 +291,8 @@ class _NerBackend(ABC):
|
||||
@property
|
||||
def device(self) -> ComponentDevice:
|
||||
"""
|
||||
The device on which the backend's model is loaded.
|
||||
|
||||
:returns:
|
||||
The device on which the backend's model is loaded.
|
||||
"""
|
||||
@ -457,8 +455,7 @@ class _SpacyBackend(_NerBackend):
|
||||
@contextmanager
|
||||
def _select_device(self):
|
||||
"""
|
||||
Context manager used to run spaCy models on a specific
|
||||
GPU in a scoped manner.
|
||||
Context manager used to run spaCy models on a specific GPU in a scoped manner.
|
||||
"""
|
||||
|
||||
# TODO: This won't restore the active device.
|
||||
|
||||
@ -26,6 +26,8 @@ REQUEST_HEADERS = {
|
||||
|
||||
def _text_content_handler(response: Response) -> ByteStream:
|
||||
"""
|
||||
Handles text content.
|
||||
|
||||
:param response: Response object from the request.
|
||||
:return: The extracted text.
|
||||
"""
|
||||
@ -34,6 +36,8 @@ def _text_content_handler(response: Response) -> ByteStream:
|
||||
|
||||
def _binary_content_handler(response: Response) -> ByteStream:
|
||||
"""
|
||||
Handles binary content.
|
||||
|
||||
:param response: Response object from the request.
|
||||
:return: The extracted binary file-like object.
|
||||
"""
|
||||
@ -211,6 +215,7 @@ class LinkContentFetcher:
|
||||
def _switch_user_agent(self, retry_state: RetryCallState) -> None:
|
||||
"""
|
||||
Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents.
|
||||
|
||||
Used by tenacity to retry the requests with a different user agent.
|
||||
|
||||
:param retry_state: The retry state (unused, required by tenacity).
|
||||
|
||||
@ -14,8 +14,9 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class AzureOpenAIGenerator(OpenAIGenerator):
|
||||
"""
|
||||
Enables text generation using OpenAI's large language models (LLMs) on Azure. It supports gpt-4 and gpt-3.5-turbo
|
||||
family of models.
|
||||
A Generator component that uses OpenAI's large language models (LLMs) on Azure to generate text.
|
||||
|
||||
It supports gpt-4 and gpt-3.5-turbo family of models.
|
||||
|
||||
Users can pass any text generation parameters valid for the `openai.ChatCompletion.create` method
|
||||
directly to this component via the `**generation_kwargs` parameter in __init__ or the `**generation_kwargs`
|
||||
@ -59,6 +60,8 @@ class AzureOpenAIGenerator(OpenAIGenerator):
|
||||
generation_kwargs: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the Azure OpenAI Generator.
|
||||
|
||||
:param azure_endpoint: The endpoint of the deployed model, e.g. `https://example-resource.azure.openai.com/`
|
||||
:param api_version: The version of the API to use. Defaults to 2023-05-15
|
||||
:param azure_deployment: The deployment of the model, usually the model name.
|
||||
|
||||
@ -14,6 +14,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class AzureOpenAIChatGenerator(OpenAIChatGenerator):
|
||||
"""
|
||||
A Chat Generator component that uses the Azure OpenAI API to generate text.
|
||||
|
||||
Enables text generation using OpenAI's large language models (LLMs) on Azure. It supports `gpt-4` and `gpt-3.5-turbo`
|
||||
family of models accessed through the chat completions API endpoint.
|
||||
|
||||
@ -76,6 +78,8 @@ class AzureOpenAIChatGenerator(OpenAIChatGenerator):
|
||||
generation_kwargs: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the Azure OpenAI Chat Generator component.
|
||||
|
||||
:param azure_endpoint: The endpoint of the deployed model, e.g. `"https://example-resource.azure.openai.com/"`
|
||||
:param api_version: The version of the API to use. Defaults to 2023-05-15
|
||||
:param azure_deployment: The deployment of the model, usually the model name.
|
||||
|
||||
@ -17,6 +17,8 @@ logger = logging.getLogger(__name__)
|
||||
@component
|
||||
class HuggingFaceAPIChatGenerator:
|
||||
"""
|
||||
A Chat Generator component that uses Hugging Face APIs to generate text.
|
||||
|
||||
This component can be used to generate text using different Hugging Face APIs with the ChatMessage format:
|
||||
- [Free Serverless Inference API](https://huggingface.co/inference-api)
|
||||
- [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
|
||||
|
||||
@ -32,6 +32,8 @@ PIPELINE_SUPPORTED_TASKS = ["text-generation", "text2text-generation"]
|
||||
@component
|
||||
class HuggingFaceLocalChatGenerator:
|
||||
"""
|
||||
A Chat Generator component that uses models available on Hugging Face Hub to generate chat responses locally.
|
||||
|
||||
The `HuggingFaceLocalChatGenerator` class is a component designed for generating chat responses using models from
|
||||
Hugging Face's model hub. It is tailored for local runtime text generation tasks and provides a convenient interface
|
||||
for working with chat-based models, such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`
|
||||
@ -78,6 +80,8 @@ class HuggingFaceLocalChatGenerator:
|
||||
streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
|
||||
):
|
||||
"""
|
||||
Initializes the HuggingFaceLocalChatGenerator component.
|
||||
|
||||
:param model: The name or path of a Hugging Face model for text generation,
|
||||
for example, `mistralai/Mistral-7B-Instruct-v0.2`, `TheBloke/OpenHermes-2.5-Mistral-7B-16k-AWQ`, etc.
|
||||
The important aspect of the model is that it should be a chat model and that it supports ChatML messaging
|
||||
|
||||
@ -24,6 +24,8 @@ logger = logging.getLogger(__name__)
|
||||
@component
|
||||
class HuggingFaceTGIChatGenerator:
|
||||
"""
|
||||
A Chat-based text generation component using Hugging Face's Text Generation Inference (TGI) framework.
|
||||
|
||||
Enables text generation using HuggingFace Hub hosted chat-based LLMs. This component is designed to seamlessly
|
||||
inference chat-based models deployed on the Text Generation Inference (TGI) backend.
|
||||
|
||||
@ -147,6 +149,8 @@ class HuggingFaceTGIChatGenerator:
|
||||
|
||||
def warm_up(self) -> None:
|
||||
"""
|
||||
Warm up the tokenizer by loading it from the model.
|
||||
|
||||
If the url is not provided, check if the model is deployed on the free tier of the HF inference API.
|
||||
Load the tokenizer
|
||||
"""
|
||||
|
||||
@ -17,6 +17,8 @@ logger = logging.getLogger(__name__)
|
||||
@component
|
||||
class OpenAIChatGenerator:
|
||||
"""
|
||||
A Chat Generator component that uses the OpenAI API to generate text.
|
||||
|
||||
Enables text generation using OpenAI's large language models (LLMs). It supports `gpt-4` and `gpt-3.5-turbo`
|
||||
family of models accessed through the chat completions API endpoint.
|
||||
|
||||
@ -71,6 +73,8 @@ class OpenAIChatGenerator:
|
||||
generation_kwargs: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Initializes the OpenAIChatGenerator component.
|
||||
|
||||
Creates an instance of OpenAIChatGenerator. Unless specified otherwise in the `model`, this is for OpenAI's
|
||||
GPT-3.5 model.
|
||||
|
||||
@ -206,6 +210,7 @@ class OpenAIChatGenerator:
|
||||
def _connect_chunks(self, chunk: Any, chunks: List[StreamingChunk]) -> ChatMessage:
|
||||
"""
|
||||
Connects the streaming chunks into a single ChatMessage.
|
||||
|
||||
:param chunk: The last chunk returned by the OpenAI API.
|
||||
:param chunks: The list of all chunks returned by the OpenAI API.
|
||||
"""
|
||||
@ -256,6 +261,7 @@ class OpenAIChatGenerator:
|
||||
def _build_message(self, completion: ChatCompletion, choice: Choice) -> ChatMessage:
|
||||
"""
|
||||
Converts the non-streaming response from the OpenAI API to a ChatMessage.
|
||||
|
||||
:param completion: The completion returned by the OpenAI API.
|
||||
:param choice: The choice returned by the OpenAI API.
|
||||
:return: The ChatMessage.
|
||||
@ -287,6 +293,7 @@ class OpenAIChatGenerator:
|
||||
def _build_chunk(self, chunk: ChatCompletionChunk) -> StreamingChunk:
|
||||
"""
|
||||
Converts the streaming response chunk from the OpenAI API to a StreamingChunk.
|
||||
|
||||
:param chunk: The chunk returned by the OpenAI API.
|
||||
:param choice: The choice returned by the OpenAI API.
|
||||
:return: The StreamingChunk.
|
||||
@ -311,6 +318,7 @@ class OpenAIChatGenerator:
|
||||
def _check_finish_reason(self, message: ChatMessage) -> None:
|
||||
"""
|
||||
Check the `finish_reason` returned with the OpenAI completions.
|
||||
|
||||
If the `finish_reason` is `length` or `content_filter`, log a warning.
|
||||
:param message: The message returned by the LLM.
|
||||
"""
|
||||
|
||||
@ -23,6 +23,8 @@ logger = logging.getLogger(__name__)
|
||||
@component
|
||||
class HuggingFaceAPIGenerator:
|
||||
"""
|
||||
A Generator component that uses Hugging Face APIs to generate text.
|
||||
|
||||
This component can be used to generate text using different Hugging Face APIs:
|
||||
- [Free Serverless Inference API]((https://huggingface.co/inference-api)
|
||||
- [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
|
||||
|
||||
@ -13,6 +13,8 @@ logger = logging.getLogger(__name__)
|
||||
@component
|
||||
class OpenAIGenerator:
|
||||
"""
|
||||
Text generation component using OpenAI's large language models (LLMs).
|
||||
|
||||
Enables text generation using OpenAI's large language models (LLMs). It supports gpt-4 and gpt-3.5-turbo
|
||||
family of models.
|
||||
|
||||
@ -258,6 +260,7 @@ class OpenAIGenerator:
|
||||
def _check_finish_reason(self, message: ChatMessage) -> None:
|
||||
"""
|
||||
Check the `finish_reason` returned with the OpenAI completions.
|
||||
|
||||
If the `finish_reason` is `length`, log a warning to the user.
|
||||
|
||||
:param message:
|
||||
|
||||
@ -7,6 +7,7 @@ from haystack.utils import deserialize_callable, serialize_callable
|
||||
def print_streaming_chunk(chunk: StreamingChunk) -> None:
|
||||
"""
|
||||
Default callback function for streaming responses.
|
||||
|
||||
Prints the tokens of the first completion to stdout as soon as they are received
|
||||
"""
|
||||
print(chunk.content, flush=True, end="")
|
||||
@ -15,6 +16,7 @@ def print_streaming_chunk(chunk: StreamingChunk) -> None:
|
||||
def serialize_callback_handler(streaming_callback: Callable[[StreamingChunk], None]) -> str:
|
||||
"""
|
||||
Serializes the streaming callback handler.
|
||||
|
||||
:param streaming_callback:
|
||||
The streaming callback handler function
|
||||
:returns:
|
||||
@ -26,6 +28,7 @@ def serialize_callback_handler(streaming_callback: Callable[[StreamingChunk], No
|
||||
def deserialize_callback_handler(callback_name: str) -> Optional[Callable[[StreamingChunk], None]]:
|
||||
"""
|
||||
Deserializes the streaming callback handler.
|
||||
|
||||
:param callback_name:
|
||||
The full path of the streaming callback handler function
|
||||
:returns:
|
||||
|
||||
@ -137,6 +137,7 @@ class DocumentJoiner:
|
||||
def _reciprocal_rank_fusion(self, document_lists):
|
||||
"""
|
||||
Merge multiple lists of Documents and assign scores based on reciprocal rank fusion.
|
||||
|
||||
The constant k is set to 61 (60 was suggested by the original paper,
|
||||
plus 1 as python lists are 0-based and the paper used 1-based ranking).
|
||||
"""
|
||||
|
||||
@ -17,6 +17,8 @@ logger = logging.getLogger(__name__)
|
||||
@component(is_greedy=True)
|
||||
class Multiplexer:
|
||||
"""
|
||||
A component which receives data connections from multiple components and distributes them to multiple components.
|
||||
|
||||
`Multiplexer` offers the ability to both receive data connections from multiple other
|
||||
components and to distribute it to various other components, enhancing the functionality of complex data
|
||||
processing pipelines.
|
||||
@ -125,6 +127,8 @@ class Multiplexer:
|
||||
|
||||
def run(self, **kwargs):
|
||||
"""
|
||||
The run method of the `Multiplexer` component.
|
||||
|
||||
Multiplexes the input data from the upstream connected components and distributes it to the downstream connected
|
||||
components.
|
||||
|
||||
|
||||
@ -12,6 +12,8 @@ logger = logging.getLogger(__name__)
|
||||
@component
|
||||
class DocumentCleaner:
|
||||
"""
|
||||
Cleans the text in the documents.
|
||||
|
||||
Cleans up text documents by removing extra whitespaces, empty lines, specified substrings, regexes,
|
||||
page headers and footers (in this order).
|
||||
|
||||
@ -38,6 +40,8 @@ class DocumentCleaner:
|
||||
remove_regex: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the DocumentCleaner.
|
||||
|
||||
:param remove_empty_lines: Whether to remove empty lines.
|
||||
:param remove_extra_whitespaces: Whether to remove extra whitespaces.
|
||||
:param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages.
|
||||
@ -97,6 +101,7 @@ class DocumentCleaner:
|
||||
def _remove_empty_lines(self, text: str) -> str:
|
||||
"""
|
||||
Remove empty lines and lines that contain nothing but whitespaces from text.
|
||||
|
||||
:param text: Text to clean.
|
||||
:returns: The text without empty lines.
|
||||
"""
|
||||
@ -107,6 +112,7 @@ class DocumentCleaner:
|
||||
def _remove_extra_whitespaces(self, text: str) -> str:
|
||||
"""
|
||||
Remove extra whitespaces from text.
|
||||
|
||||
:param text: Text to clean.
|
||||
:returns: The text without extra whitespaces.
|
||||
"""
|
||||
@ -115,6 +121,7 @@ class DocumentCleaner:
|
||||
def _remove_regex(self, text: str, regex: str) -> str:
|
||||
"""
|
||||
Remove substrings that match the specified regex from the text.
|
||||
|
||||
:param text: Text to clean.
|
||||
:param regex: Regex to match and replace substrings by "".
|
||||
:returns: The text without the substrings that match the regex.
|
||||
@ -124,6 +131,7 @@ class DocumentCleaner:
|
||||
def _remove_substrings(self, text: str, substrings: List[str]) -> str:
|
||||
"""
|
||||
Remove all specified substrings from the text.
|
||||
|
||||
:param text: Text to clean.
|
||||
:param substrings: Substrings to remove.
|
||||
:returns: The text without the specified substrings.
|
||||
@ -135,6 +143,7 @@ class DocumentCleaner:
|
||||
def _remove_repeated_substrings(self, text: str) -> str:
|
||||
"""
|
||||
Remove any substrings from the text that occur repeatedly on every page. For example headers or footers.
|
||||
|
||||
Pages in the text need to be separated by form feed character "\f".
|
||||
:param text: Text to clean.
|
||||
:returns: The text without the repeated substrings.
|
||||
@ -148,6 +157,7 @@ class DocumentCleaner:
|
||||
) -> str:
|
||||
"""
|
||||
Heuristic to find footers and headers across different pages by searching for the longest common string.
|
||||
|
||||
Pages in the text need to be separated by form feed character "\f".
|
||||
For headers, we only search in the first n_chars characters (for footer: last n_chars).
|
||||
Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
|
||||
@ -182,6 +192,7 @@ class DocumentCleaner:
|
||||
def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
|
||||
"""
|
||||
Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace.
|
||||
|
||||
:param seq: The sequence to generate ngrams from.
|
||||
:param n: The length of the ngrams to generate.
|
||||
:returns: A Generator generating all ngrams of length n from the given sequence.
|
||||
@ -202,6 +213,7 @@ class DocumentCleaner:
|
||||
def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
|
||||
"""
|
||||
Generates all possible ngrams from a given sequence of text.
|
||||
|
||||
Considering all ngram lengths between the minimum and maximum length.
|
||||
|
||||
:param seq: The sequence to generate ngrams from.
|
||||
@ -217,6 +229,7 @@ class DocumentCleaner:
|
||||
def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, max_ngram: int = 30) -> str:
|
||||
"""
|
||||
Find the longest common ngram across a list of text sequences (e.g. start of pages).
|
||||
|
||||
Considering all ngram lengths between the minimum and maximum length. Helpful for finding footers, headers etc.
|
||||
Empty sequences are ignored.
|
||||
|
||||
|
||||
@ -23,6 +23,8 @@ class DocumentSplitter:
|
||||
split_overlap: int = 0,
|
||||
):
|
||||
"""
|
||||
Initialize the DocumentSplitter.
|
||||
|
||||
:param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
|
||||
"sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
|
||||
:param split_length: The maximum number of units in each split.
|
||||
@ -42,6 +44,8 @@ class DocumentSplitter:
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(self, documents: List[Document]):
|
||||
"""
|
||||
Split documents into smaller parts.
|
||||
|
||||
Splits documents by the unit expressed in `split_by`, with a length of `split_length`
|
||||
and an overlap of `split_overlap`.
|
||||
|
||||
|
||||
@ -8,10 +8,12 @@ from haystack import component
|
||||
@component
|
||||
class TextCleaner:
|
||||
"""
|
||||
A preprocessor component to clean text data. It can remove substrings matching a list of regular expressions,
|
||||
convert text to lowercase, remove punctuation, and remove numbers.
|
||||
A PreProcessor component to clean text data.
|
||||
|
||||
This is useful to cleanup text data before evaluation.
|
||||
It can remove substrings matching a list of regular expressions, convert text to lowercase, remove punctuation,
|
||||
and remove numbers.
|
||||
|
||||
This is useful to clean up text data before evaluation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@ -22,6 +24,8 @@ class TextCleaner:
|
||||
remove_numbers: bool = False,
|
||||
):
|
||||
"""
|
||||
Initialize the TextCleaner component.
|
||||
|
||||
:param remove_regexps: A list of regular expressions. If provided, it removes substrings
|
||||
matching these regular expressions from the text.
|
||||
:param convert_to_lowercase: If True, converts all characters to lowercase.
|
||||
|
||||
@ -6,6 +6,8 @@ from haystack import Document, component
|
||||
@component
|
||||
class LostInTheMiddleRanker:
|
||||
"""
|
||||
A LostInTheMiddle Ranker.
|
||||
|
||||
Ranks documents based on the 'lost in the middle' order so that the most relevant documents are either at the
|
||||
beginning or end, while the least relevant are in the middle.
|
||||
|
||||
@ -33,6 +35,8 @@ class LostInTheMiddleRanker:
|
||||
|
||||
def __init__(self, word_count_threshold: Optional[int] = None, top_k: Optional[int] = None):
|
||||
"""
|
||||
Initialize the LostInTheMiddleRanker.
|
||||
|
||||
If 'word_count_threshold' is specified, this ranker includes all documents up until the point where adding
|
||||
another document would exceed the 'word_count_threshold'. The last document that causes the threshold to
|
||||
be breached will be included in the resulting list of documents, but all subsequent documents will be
|
||||
|
||||
@ -141,6 +141,7 @@ class MetaFieldRanker:
|
||||
):
|
||||
"""
|
||||
Ranks a list of Documents based on the selected meta field by:
|
||||
|
||||
1. Sorting the Documents by the meta field in descending or ascending order.
|
||||
2. Merging the rankings from the previous component and based on the meta field according to ranking mode and
|
||||
weight.
|
||||
@ -337,8 +338,10 @@ class MetaFieldRanker:
|
||||
@staticmethod
|
||||
def _calculate_rrf(rank: int, k: int = 61) -> float:
|
||||
"""
|
||||
Calculates the reciprocal rank fusion. The constant K is set to 61 (60 was suggested by the original paper,
|
||||
plus 1 as python lists are 0-based and the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking).
|
||||
Calculates the reciprocal rank fusion.
|
||||
|
||||
The constant K is set to 61 (60 was suggested by the original paper, plus 1 as python lists are 0-based and
|
||||
the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking).
|
||||
"""
|
||||
return 1 / (k + rank)
|
||||
|
||||
@ -346,6 +349,7 @@ class MetaFieldRanker:
|
||||
def _calc_linear_score(rank: int, amount: int) -> float:
|
||||
"""
|
||||
Calculate the meta field score as a linear score between the greatest and the lowest score in the list.
|
||||
|
||||
This linear scaling is useful for:
|
||||
- Reducing the effect of outliers
|
||||
- Creating scores that are meaningfully distributed in the range [0,1],
|
||||
|
||||
@ -15,6 +15,8 @@ with LazyImport(message="Run 'pip install \"sentence-transformers>=2.2.0\"'") as
|
||||
@component
|
||||
class SentenceTransformersDiversityRanker:
|
||||
"""
|
||||
A Diversity Ranker based on Sentence Transformers.
|
||||
|
||||
Implements a document ranking algorithm that orders documents in such a way as to maximize the overall diversity
|
||||
of the documents.
|
||||
|
||||
|
||||
@ -455,6 +455,8 @@ class ExtractiveReader:
|
||||
self, answers: List[ExtractedAnswer], overlap_threshold: Optional[float]
|
||||
) -> List[ExtractedAnswer]:
|
||||
"""
|
||||
De-duplicates overlapping Extractive Answers.
|
||||
|
||||
De-duplicates overlapping Extractive Answers from the same document based on how much the spans of the
|
||||
answers overlap.
|
||||
|
||||
|
||||
@ -163,6 +163,8 @@ class ConditionalRouter:
|
||||
|
||||
def run(self, **kwargs):
|
||||
"""
|
||||
Executes the routing logic.
|
||||
|
||||
Executes the routing logic by evaluating the specified boolean condition expressions for each route in the order they are listed.
|
||||
The method directs the flow of data to the output specified in the first route whose `condition` is True.
|
||||
|
||||
|
||||
@ -13,6 +13,8 @@ logger = logging.getLogger(__name__)
|
||||
@component
|
||||
class FileTypeRouter:
|
||||
"""
|
||||
Groups a list of data sources by their MIME types.
|
||||
|
||||
FileTypeRouter groups a list of data sources (file paths or byte streams) by their MIME types, allowing
|
||||
for flexible routing of files to different components based on their content type. It supports both exact MIME type
|
||||
matching and pattern matching using regular expressions.
|
||||
@ -50,6 +52,8 @@ class FileTypeRouter:
|
||||
|
||||
def __init__(self, mime_types: List[str]):
|
||||
"""
|
||||
Initialize the FileTypeRouter component.
|
||||
|
||||
:param mime_types: A list of file mime types to consider when routing files
|
||||
(e.g. `["text/plain", "audio/x-wav", "image/jpeg"]`).
|
||||
"""
|
||||
|
||||
@ -72,6 +72,8 @@ class MetadataRouter:
|
||||
|
||||
def run(self, documents: List[Document]):
|
||||
"""
|
||||
Route the documents.
|
||||
|
||||
Route the documents to different edges based on their fields content and the rules specified during initialization.
|
||||
If a document does not match any of the rules, it is routed to a connection named "unmatched".
|
||||
|
||||
|
||||
@ -44,6 +44,8 @@ class TextLanguageRouter:
|
||||
|
||||
def __init__(self, languages: Optional[List[str]] = None):
|
||||
"""
|
||||
Initialize the TextLanguageRouter component.
|
||||
|
||||
:param languages: A list of languages in ISO code, each corresponding to a different output connection.
|
||||
For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages).
|
||||
If not specified, the default is `["en"]`.
|
||||
@ -57,6 +59,7 @@ class TextLanguageRouter:
|
||||
def run(self, text: str) -> Dict[str, str]:
|
||||
"""
|
||||
Route the text to one of different output connections based on its language.
|
||||
|
||||
If the text does not match any of the languages specified at initialization, it is routed to
|
||||
a connection named "unmatched".
|
||||
|
||||
|
||||
@ -21,6 +21,7 @@ with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]'") a
|
||||
class TransformersZeroShotTextRouter:
|
||||
"""
|
||||
Routes a text input onto different output connections depending on which label it has been categorized into.
|
||||
|
||||
This is useful for routing queries to different models in a pipeline depending on their categorization.
|
||||
The set of labels to be used for categorization can be specified.
|
||||
|
||||
@ -102,6 +103,8 @@ class TransformersZeroShotTextRouter:
|
||||
huggingface_pipeline_kwargs: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Initializes the TransformersZeroShotTextRouter.
|
||||
|
||||
:param labels: The set of possible class labels to classify each sequence into. Can be a single label,
|
||||
a string of comma-separated labels, or a list of labels.
|
||||
:param multi_label: Whether or not multiple candidate labels can be true.
|
||||
@ -187,8 +190,9 @@ class TransformersZeroShotTextRouter:
|
||||
@component.output_types(documents=Dict[str, str])
|
||||
def run(self, text: str):
|
||||
"""
|
||||
Run the TransformersZeroShotTextRouter. This method routes the text to one of the different edges based on which label
|
||||
it has been categorized into.
|
||||
Run the TransformersZeroShotTextRouter.
|
||||
|
||||
This method routes the text to one of the different edges based on which label it has been categorized into.
|
||||
|
||||
:param text: A str to route to one of the different edges.
|
||||
:returns:
|
||||
|
||||
@ -56,6 +56,7 @@ class TopPSampler:
|
||||
def run(self, documents: List[Document], top_p: Optional[float] = None):
|
||||
"""
|
||||
Filters documents using top-p sampling based on their scores.
|
||||
|
||||
If the specified top_p results in no documents being selected (especially in cases of a low top_p value), the
|
||||
method returns the document with the highest similarity score.
|
||||
|
||||
@ -113,6 +114,7 @@ class TopPSampler:
|
||||
def _collect_scores(self, documents: List[Document]) -> List[float]:
|
||||
"""
|
||||
Collect the scores from the documents' metadata.
|
||||
|
||||
:param documents: List of Documents.
|
||||
:return: List of scores.
|
||||
"""
|
||||
|
||||
@ -77,6 +77,8 @@ class JsonSchemaValidator:
|
||||
|
||||
def __init__(self, json_schema: Optional[Dict[str, Any]] = None, error_template: Optional[str] = None):
|
||||
"""
|
||||
Initialize the JsonSchemaValidator component.
|
||||
|
||||
:param json_schema: A dictionary representing the [JSON schema](https://json-schema.org/) against which
|
||||
the messages' content is validated.
|
||||
:param error_template: A custom template string for formatting the error message in case of validation failure.
|
||||
@ -186,8 +188,9 @@ class JsonSchemaValidator:
|
||||
|
||||
def _recursive_json_to_object(self, data: Any) -> Any:
|
||||
"""
|
||||
Recursively traverses a data structure (dictionary or list), converting any string values
|
||||
that are valid JSON objects into dictionary objects, and returns a new data structure.
|
||||
Convert any string values that are valid JSON objects into dictionary objects.
|
||||
|
||||
Returns a new data structure.
|
||||
|
||||
:param data: The data structure to be traversed.
|
||||
:return: A new data structure with JSON strings converted to dictionary objects.
|
||||
|
||||
@ -41,6 +41,8 @@ class SearchApiWebSearch:
|
||||
search_params: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the SearchApiWebSearch component.
|
||||
|
||||
:param api_key: API key for the SearchApi API
|
||||
:param top_k: Number of documents to return.
|
||||
:param allowed_domains: List of domains to limit the search to.
|
||||
|
||||
@ -44,6 +44,8 @@ class SerperDevWebSearch:
|
||||
search_params: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the SerperDevWebSearch component.
|
||||
|
||||
:param api_key: API key for the Serper API.
|
||||
:param top_k: Number of documents to return.
|
||||
:param allowed_domains: List of domains to limit the search to.
|
||||
|
||||
@ -50,6 +50,7 @@ class DocumentWriter:
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Serializes the component to a dictionary.
|
||||
|
||||
:returns:
|
||||
Dictionary with serialized data.
|
||||
"""
|
||||
|
||||
@ -93,10 +93,10 @@ _COMPONENT_PRE_INIT_CALLBACK: ContextVar[Optional[Callable]] = ContextVar("compo
|
||||
@contextmanager
|
||||
def _hook_component_init(callback: Callable):
|
||||
"""
|
||||
Context manager to set a callback that will be invoked
|
||||
before a component's constructor is called. The callback
|
||||
receives the component class and the init parameters (as keyword
|
||||
arguments) and can modify the init parameters in place.
|
||||
Context manager to set a callback that will be invoked before a component's constructor is called.
|
||||
|
||||
The callback receives the component class and the init parameters (as keyword arguments) and can modify the init
|
||||
parameters in place.
|
||||
|
||||
:param callback:
|
||||
Callback function to invoke.
|
||||
@ -165,8 +165,7 @@ class ComponentMeta(type):
|
||||
|
||||
def __call__(cls, *args, **kwargs):
|
||||
"""
|
||||
This method is called when clients instantiate a Component and
|
||||
runs before __new__ and __init__.
|
||||
This method is called when clients instantiate a Component and runs before __new__ and __init__.
|
||||
"""
|
||||
# This will call __new__ then __init__, giving us back the Component instance
|
||||
pre_init_hook = _COMPONENT_PRE_INIT_CALLBACK.get()
|
||||
@ -234,6 +233,7 @@ class ComponentMeta(type):
|
||||
def _component_repr(component: Component) -> str:
|
||||
"""
|
||||
All Components override their __repr__ method with this one.
|
||||
|
||||
It prints the component name and the input/output sockets.
|
||||
"""
|
||||
result = object.__repr__(component)
|
||||
@ -325,8 +325,7 @@ class _Component:
|
||||
|
||||
def set_output_types(self, instance, **types):
|
||||
"""
|
||||
Method that specifies the output types when the 'run' method is not decorated
|
||||
with 'component.output_types'.
|
||||
Method that specifies the output types when the 'run' method is not decorated with 'component.output_types'.
|
||||
|
||||
Use as:
|
||||
|
||||
@ -364,6 +363,8 @@ class _Component:
|
||||
|
||||
def output_types_decorator(run_method):
|
||||
"""
|
||||
Decorator that sets the output types of the decorated method.
|
||||
|
||||
This happens at class creation time, and since we don't have the decorated
|
||||
class available here, we temporarily store the output types as an attribute of
|
||||
the decorated method. The ComponentMeta metaclass will use this data to create
|
||||
@ -390,9 +391,9 @@ class _Component:

def copy_class_namespace(namespace):
"""
- This is the callback that `typing.new_class` will use
- to populate the newly created class. We just copy
- the whole namespace from the decorated class.
+ This is the callback that `typing.new_class` will use to populate the newly created class.
+
+ Simply copy the whole namespace from the decorated class.
"""
for key, val in dict(cls.__dict__).items():
# __dict__ and __weakref__ are class-bound, we should let Python recreate them.
@ -102,8 +102,8 @@ def _to_mermaid_text(graph: networkx.MultiDiGraph) -> str:
"""
Converts a Networkx graph into Mermaid syntax.

- The output of this function can be used in the documentation with `mermaid` codeblocks, and it will
- be automatically rendered.
+ The output of this function can be used in the documentation with `mermaid` codeblocks and will be
+ automatically rendered.
"""
# Copy the graph to avoid modifying the original
graph = _prepare_for_drawing(graph.copy())
@ -241,10 +241,11 @@ class Pipeline:
callbacks: Optional[DeserializationCallbacks] = None,
) -> "Pipeline":
"""
- Creates a `Pipeline` object from the string representation read from the file-like object passed in the `fp` argument.
+ Creates a `Pipeline` object from a string representation.
+
+ The string representation is read from the file-like object passed in the `fp` argument.

:param data:
The string representation of the pipeline, can be `str`, `bytes` or `bytearray`.
:param fp:
A file-like object ready to be read from.
:param marshaller:
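A hedged sketch of how this deserialization entry point is typically used; the classmethod names (`loads` for strings, `load` for file-like objects) are inferred from the `data` and `fp` parameters above and are not confirmed by this diff:

```python
from haystack import Pipeline

# From a YAML string representation of a pipeline...
with open("pipeline.yaml", "r", encoding="utf-8") as fp:
    pipeline_from_string = Pipeline.loads(fp.read())

# ...or directly from a file-like object.
with open("pipeline.yaml", "r", encoding="utf-8") as fp:
    pipeline_from_file = Pipeline.load(fp)
```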
@ -312,7 +313,7 @@ class Pipeline:
Connects two components together.

All components to connect must exist in the pipeline.
- If connecting to an component that has several output connections, specify the inputs and output names as
+ If connecting to a component that has several output connections, specify the inputs and output names as
'component_name.connections_name'.

:param sender:
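For context, a short sketch of the `'component_name.connection_name'` addressing described above (the pipeline layout and component names are illustrative):

```python
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

pipe = Pipeline()
pipe.add_component("retriever", InMemoryBM25Retriever(document_store=InMemoryDocumentStore()))
pipe.add_component("prompt_builder", PromptBuilder(template="Context: {{ documents }}"))

# Explicit socket names ("sender.output" -> "receiver.input") resolve any ambiguity.
pipe.connect("retriever.documents", "prompt_builder.documents")
```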
@ -598,6 +599,8 @@ class Pipeline:

def _validate_input(self, data: Dict[str, Any]):
"""
+ Validates pipeline input data.
+
Validates that data:
* Each Component name actually exists in the Pipeline
* Each Component is not missing any input
@ -1047,6 +1050,8 @@ class Pipeline:

def _prepare_component_input_data(self, data: Dict[str, Any]) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]:
"""
+ Prepares input data for pipeline components.
+
Organizes input data for pipeline components and identifies any inputs that are not matched to any
component's input slots.
@ -22,9 +22,10 @@ class PredefinedPipeline(Enum):

class PipelineTemplate:
"""
- The PipelineTemplate class enables the straightforward creation of flexible and configurable pipelines using Jinja2 templated YAML files.
- Specifically designed to simplify the setup of complex data processing pipelines for
+ The PipelineTemplate enables the creation of flexible and configurable pipelines.
+
+ The PipelineTemplate class enables the straightforward creation of flexible and configurable pipelines using
+ Jinja2 templated YAML files. Specifically designed to simplify the setup of complex data processing pipelines for
a range of NLP tasks—including question answering, retriever augmented generation (RAG), document indexing, among
others - PipelineTemplate empowers users to dynamically generate pipeline configurations from templates and
customize components as necessary. Its design philosophy centers on providing an accessible, yet powerful, tool
@ -63,9 +64,9 @@ class PipelineTemplate:
"""
Initialize a PipelineTemplate.

- Besides calling the constructor directly, a set of utility methods is provided
- for conveniently create an instance of `PipelineTemplate` from different sources. See `from_string`,
- `from_file`, `from_predefined` and `from_url`.
+ Besides calling the constructor directly, a set of utility methods is provided to conveniently create an
+ instance of `PipelineTemplate` from different sources. See `from_string`, `from_file`, `from_predefined`
+ and `from_url`.

:param template_content: The raw template source to use in the template.
"""
@ -106,7 +107,9 @@ class PipelineTemplate:
@classmethod
def from_predefined(cls, predefined_pipeline: PredefinedPipeline) -> "PipelineTemplate":
"""
- Create a PipelineTemplate from a predefined template. See `PredefinedPipeline` for available options.
+ Create a PipelineTemplate from a predefined template.
+
+ See `PredefinedPipeline` for available options.

:param predefined_pipeline: The predefined pipeline to use.
:returns: An instance of `PipelineTemplate `.
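For context, a hedged usage sketch; the import path mirrors the module this hunk touches, and the enum member listing is kept generic because the available options are not shown in this diff:

```python
# Import path assumed to mirror haystack/core/pipeline/template.py.
from haystack.core.pipeline.template import PipelineTemplate, PredefinedPipeline

# Inspect the available predefined templates, then build one of them.
for option in PredefinedPipeline:
    print(option.name)

template = PipelineTemplate.from_predefined(next(iter(PredefinedPipeline)))
```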
@ -123,8 +123,7 @@ def default_to_dict(obj: Any, **init_parameters) -> Dict[str, Any]:
"""
Utility function to serialize an object to a dictionary.

- This is mostly necessary for Components, but it can be used by any object.
-
+ This is mostly necessary for components but can be used by any object.
`init_parameters` are parameters passed to the object class `__init__`.
They must be defined explicitly as they'll be used when creating a new
instance of `obj` with `from_dict`. Omitting them might cause deserialisation
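A minimal sketch of the pattern this helper supports inside a custom component's `to_dict` (the component itself is illustrative):

```python
from typing import Any, Dict

from haystack import component, default_to_dict


@component
class Greeter:
    """Illustrative component whose only init parameter is a greeting string."""

    def __init__(self, greeting: str = "Hello"):
        self.greeting = greeting

    def to_dict(self) -> Dict[str, Any]:
        # Every init parameter must be passed explicitly so from_dict can rebuild the object.
        return default_to_dict(self, greeting=self.greeting)

    def run(self, name: str):
        return {"message": f"{self.greeting}, {name}!"}
```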
@ -165,7 +164,7 @@ def default_from_dict(cls: Type[object], data: Dict[str, Any]) -> Any:
"""
Utility function to deserialize a dictionary to an object.

- This is mostly necessary for Components but, it can be used by any object.
+ This is mostly necessary for components but can be used by any object.

The function will raise a `DeserializationError` if the `type` field in `data` is
missing or it doesn't match the type of `cls`.
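And the matching deserialization side, sketched with a built-in component; that this component's own `to_dict` produces a dictionary acceptable to `default_from_dict` is an assumption of this example:

```python
from haystack import default_from_dict
from haystack.components.builders import PromptBuilder

builder = PromptBuilder(template="Answer: {{ answer }}")
data = builder.to_dict()  # a dict with "type" and "init_parameters" keys

# Raises DeserializationError if data["type"] is missing or does not match PromptBuilder.
same_builder = default_from_dict(PromptBuilder, data)
```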
@ -8,7 +8,7 @@ class SparseEmbedding:

def __init__(self, indices: List[int], values: List[float]):
"""
- Initialize a sparse embedding.
+ Initialize a SparseEmbedding object.

:param indices: List of indices of non-zero elements in the embedding.
:param values: List of values of non-zero elements in the embedding.
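For context, a one-line construction sketch (the import path is an assumption):

```python
from haystack.dataclasses import SparseEmbedding  # import path assumed

# A sparse vector with non-zero values at positions 0, 7 and 42.
embedding = SparseEmbedding(indices=[0, 7, 42], values=[0.5, 0.1, 0.9])
```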
@ -22,7 +22,7 @@ class SparseEmbedding:

def to_dict(self):
"""
- Convert the sparse embedding to a dictionary.
+ Convert the SparseEmbedding object to a dictionary.

:returns:
Serialized sparse embedding.
@ -32,7 +32,7 @@ class SparseEmbedding:
@classmethod
def from_dict(cls, sparse_embedding_dict):
"""
- Deserializes the sparse embedding from a dictionary.
+ Deserializes the sparse embedding from a dictionary.

:param sparse_embedding_dict:
Dictionary to deserialize from.
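A round-trip sketch combining the three SparseEmbedding hunks above (import path assumed, values illustrative):

```python
from haystack.dataclasses import SparseEmbedding  # import path assumed, as above

embedding = SparseEmbedding(indices=[0, 7, 42], values=[0.5, 0.1, 0.9])
as_dict = embedding.to_dict()            # serialized sparse embedding
restored = SparseEmbedding.from_dict(as_dict)
assert restored.indices == [0, 7, 42] and restored.values == [0.5, 0.1, 0.9]
```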