Docs: Update language classifier docstrings (#4413)

* Update language classifier docstrings

* Apply suggestions from code review

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
Agnieszka Marzec 2023-03-17 13:40:02 +02:00 committed by GitHub
parent f04b2f3cee
commit 26e0fbb4f8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 50 deletions

View File

@ -12,7 +12,7 @@ DEFAULT_LANGUAGES = ["en", "de", "es", "cs", "nl"]
class BaseDocumentLanguageClassifier(BaseComponent): class BaseDocumentLanguageClassifier(BaseComponent):
""" """
Abstract class for Document Language Classifiers Abstract class for Document Language Classifiers.
""" """
outgoing_edges = len(DEFAULT_LANGUAGES) outgoing_edges = len(DEFAULT_LANGUAGES)
@ -27,8 +27,8 @@ class BaseDocumentLanguageClassifier(BaseComponent):
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None): def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
""" """
:param route_by_language: whether to send Documents on a different output edge depending on their language. :param route_by_language: Routes Documents to a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO code, see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)). :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
""" """
super().__init__() super().__init__()
@ -36,13 +36,12 @@ class BaseDocumentLanguageClassifier(BaseComponent):
languages_to_route = DEFAULT_LANGUAGES languages_to_route = DEFAULT_LANGUAGES
if route_by_language is True: if route_by_language is True:
logger.info( logger.info(
"languages_to_route list has not been defined. The default list will be used: %s", "The languages_to_route list is not defined. The default list will be used: %s", languages_to_route
languages_to_route,
) )
if len(set(languages_to_route)) != len(languages_to_route): if len(set(languages_to_route)) != len(languages_to_route):
duplicates = {lang for lang in languages_to_route if languages_to_route.count(lang) > 1} duplicates = {lang for lang in languages_to_route if languages_to_route.count(lang) > 1}
raise ValueError(f"languages_to_route parameter can't contain duplicate values ({duplicates}).") raise ValueError(f"The languages_to_route parameter can't contain duplicate values ({duplicates}).")
self.route_by_language = route_by_language self.route_by_language = route_by_language
self.languages_to_route = languages_to_route self.languages_to_route = languages_to_route
@ -62,7 +61,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
""" """
Run language document classifier on a list of documents. Run language document classifier on a list of documents.
:param documents: list of documents to detect language. :param documents: A list of documents whose language you want to detect.
""" """
docs_with_languages = self.predict(documents=documents) docs_with_languages = self.predict(documents=documents)
output = {"documents": docs_with_languages} output = {"documents": docs_with_languages}
@ -75,7 +74,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
unique_languages = list(set(languages)) unique_languages = list(set(languages))
if len(unique_languages) > 1: if len(unique_languages) > 1:
raise ValueError( raise ValueError(
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. " f"If the route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
"If you want to route documents by language, you can call Pipeline.run() once for each Document." "If you want to route documents by language, you can call Pipeline.run() once for each Document."
) )
language = unique_languages[0] language = unique_languages[0]
@ -97,7 +96,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
""" """
Run language document classifier on batches of documents. Run language document classifier on batches of documents.
:param documents: list of lists of documents to detect language. :param documents: A list of lists of documents whose language you want to detect.
""" """
docs_lists_with_languages = self.predict_batch(documents=documents, batch_size=batch_size) docs_lists_with_languages = self.predict_batch(documents=documents, batch_size=batch_size)
@ -115,13 +114,13 @@ class BaseDocumentLanguageClassifier(BaseComponent):
unique_languages = list(set(languages)) unique_languages = list(set(languages))
if len(unique_languages) > 1: if len(unique_languages) > 1:
raise ValueError( raise ValueError(
f"If route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. " f"If the route_by_language parameter is True, Documents of multiple languages ({unique_languages}) are not allowed together. "
"If you want to route documents by language, you can call Pipeline.run() once for each Document." "If you want to route documents by language, you can call Pipeline.run() once for each Document."
) )
if unique_languages[0] is None: if unique_languages[0] is None:
logger.warning( logger.warning(
"The model cannot detect the language of some of the documents." "The model cannot detect the language of some of the documents."
"The first language in the list of supported languages will be used to route the document: %s", "The first language in the list of supported languages will be used to route the documents: %s",
self.languages_to_route[0], self.languages_to_route[0],
) )
language: Optional[str] = self.languages_to_route[0] language: Optional[str] = self.languages_to_route[0]
@ -129,7 +128,7 @@ class BaseDocumentLanguageClassifier(BaseComponent):
if language not in self.languages_to_route: if language not in self.languages_to_route:
raise ValueError( raise ValueError(
f"'{language}' is not in the list of languages to route ({', '.join(self.languages_to_route)})." f"'{language}' is not in the list of languages to route ({', '.join(self.languages_to_route)})."
f"You should specify them when initializing the node, using the parameter languages_to_route." f"Specify them when initializing the node, using the parameter languages_to_route."
) )
edge_name = self._get_edge_from_language(str(language)) edge_name = self._get_edge_from_language(str(language))

View File

@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)
class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier): class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
""" """
Node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for document language classification. A node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for classifying the language of documents.
This node detects the language of Documents and adds the output to the Documents metadata. This node detects the language of Documents and adds the output to the Documents metadata.
The meta field of the Document is a dictionary with the following format: The meta field of the Document is a dictionary with the following format:
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}`` ``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
- Using the document language classifier, you can directly get predictions via predict() - Using the document language classifier, you can directly get predictions with `predict()`.
- You can flow the Documents to different branches depending on their language, - You can route the Documents to different branches depending on their language
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter. by setting the `route_by_language` parameter to `True` and specifying the `languages_to_route` parameter.
**Usage example** **Usage example**
```python ```python
... ...
@ -46,16 +46,16 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None): def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
""" """
:param route_by_language: whether to send Documents on a different output edge depending on their language. :param route_by_language: Sends Documents to a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (ISO code, see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)). :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
""" """
super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route) super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]: def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
""" """
Detect the languge of Documents and add the output to the Documents metadata. Detect the language of Documents and add the output to the Documents metadata.
:param documents: list of Documents to detect language. :param documents: A list of Documents whose language you want to detect.
:return: List of Documents, where Document.meta["language"] contains the predicted language :return: List of Documents, where Document.meta["language"] contains the predicted language.
""" """
if len(documents) == 0: if len(documents) == 0:
raise ValueError( raise ValueError(
@ -79,8 +79,8 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]: def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
""" """
Detect the documents language and add the output to the document's meta data. Detect the Document's language and add the output to the Document's meta data.
:param documents: list of lists of Documents to detect language. :param documents: A list of lists of Documents whose language you want to detect.
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language :return: List of lists of Documents, where Document.meta["language"] contains the predicted language
""" """
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents): if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):

View File

@ -15,14 +15,13 @@ logger = logging.getLogger(__name__)
class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier): class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
""" """
Transformer based model for document language classification using the HuggingFace's transformers framework Transformer-based model for classifying the document language using Hugging Face's [transformers framework](https://github.com/huggingface/transformers).
(https://github.com/huggingface/transformers).
While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same. While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
This node detects the language of Documents and adds the output to the Documents metadata. This node detects the language of Documents and adds the output to the Documents metadata.
The meta field of the Document is a dictionary with the following format: The meta field of the Document is a dictionary with the following format:
``'meta': {'name': '450_Baelor.txt', 'language': 'en'}`` ``'meta': {'name': '450_Baelor.txt', 'language': 'en'}``
- Using the document language classifier, you can directly get predictions via predict() - Using the document language classifier, you can directly get predictions with the `predict()` method.
- You can flow the Documents to different branches depending on their language, - You can route the Documents to different branches depending on their language
by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter. by setting the `route_by_language` parameter to True and specifying the `languages_to_route` parameter.
**Usage example** **Usage example**
```python ```python
@ -66,28 +65,27 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
): ):
""" """
Load a language detection model from Transformers. Load a language detection model from Transformers.
See https://huggingface.co/models for full list of available models. For a full list of available models, see [Hugging Face models](https://huggingface.co/models).
Language detection models: https://huggingface.co/models?search=language%20detection For language detection models, see [Language Detection models](https://huggingface.co/models?search=language%20detection) on Hugging Face.
:param route_by_language: whether to send Documents on a different output edge depending on their language. :param route_by_language: Sends Documents to a different output edge depending on their language.
:param languages_to_route: list of languages, each corresponding to a different output edge (for the list of the supported languages, see the model card of the chosen model). :param languages_to_route: A list of languages, each corresponding to a different output edge (for the list of supported languages, see the model card of the chosen model).
:param labels_to_languages_mapping: some Transformers models do not return language names but generic labels. In this case, you can provide a mapping indicating a language for each label. For example: {"LABEL_1": "ar", "LABEL_2": "bg", ...}. :param labels_to_languages_mapping: Some Transformers models return generic labels instead of language names. In this case, you can provide a mapping indicating a language for each label. For example: {"LABEL_1": "ar", "LABEL_2": "bg", ...}.
:param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'papluca/xlm-roberta-base-language-detection'. :param model_name_or_path: Directory of a saved model or the name of a public model, for example 'papluca/xlm-roberta-base-language-detection'.
See https://huggingface.co/models for full list of available models. See [Hugging Face models](https://huggingface.co/models) for a full list of available models.
:param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. :param model_version: The version of the model to use from the Hugging Face model hub. Can be a tag name, a branch name, or a commit hash.
:param tokenizer: Name of the tokenizer (usually the same as model) :param tokenizer: Name of the tokenizer (usually the same as model).
:param use_gpu: Whether to use GPU (if available). :param use_gpu: Whether to use GPU (if available).
:param batch_size: Number of Documents to be processed at a time. :param batch_size: Number of Documents to be processed at a time.
:param progress_bar: Whether to show a progress bar while processing. :param progress_bar: Whether to show a progress bar while processing.
:param use_auth_token: The API token used to download private models from Huggingface. :param use_auth_token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running If set to `True`, the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used. `transformers-cli login` (stored in ~/.huggingface) is used.
Additional information can be found here For more information, see [Hugging Face documentation](https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained).
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained :param devices: List of torch devices (for example, cuda, cpu, mps) to limit inference to specific devices.
:param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. A list containing torch device objects or strings is supported (for example
A list containing torch device objects and/or strings is supported (For example [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
parameter is not used and a single cpu device is used for inference. parameter is not used and a single cpu device is used for inference.
""" """
@ -118,10 +116,10 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]: def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
""" """
Detect the languge of Documents and add the output to the Documents metadata. Detect the language of Documents and add the output to the Documents metadata.
:param documents: list of Documents to detect language. :param documents: A list of Documents whose language you want to detect.
:param batch_size: The number of Documents to classify at a time. :param batch_size: The number of Documents to classify at a time.
:return: List of Documents, where Document.meta["language"] contains the predicted language :return: A list of Documents, where Document.meta["language"] contains the predicted language.
""" """
if len(documents) == 0: if len(documents) == 0:
raise ValueError( raise ValueError(
@ -148,9 +146,9 @@ class TransformersDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]: def predict_batch(self, documents: List[List[Document]], batch_size: Optional[int] = None) -> List[List[Document]]:
""" """
Detect the documents language and add the output to the document's meta data. Detect the Document's language and add the output to the Document's meta data.
:param documents: list of lists of Documents to detect language. :param documents: A list of lists of Documents whose language you want to detect.
:return: List of lists of Documents, where Document.meta["language"] contains the predicted language :return: A list of lists of Documents where Document.meta["language"] contains the predicted language.
""" """
if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents): if len(documents) == 0 or all(len(docs_list) == 0 for docs_list in documents):
raise ValueError( raise ValueError(