docs: Update docstrings and API doc config for haystack.components.extractors (#7251)

* docs: Update docstrings and API doc config for `haystack.components.extractors` * Review comments
2026-01-07 04:27:15 +00:00 · 2024-03-01 12:45:26 +01:00 · 2024-03-01 12:45:26 +01:00 · b554e4cd49
commit b554e4cd49
parent a57ed6dafe
2 changed files with 57 additions and 26 deletions
--- a/docs/pydoc/config/extractors_api.yml
+++ b/docs/pydoc/config/extractors_api.yml
@ -9,6 +9,8 @@ processors:
    documented_only: true
    do_not_filter_modules: false
    skip_empty_modules: true
+  - type: filter
+    expression: "name not in ['_BackendEnumMeta', '_NerBackend', '_HfBackend', '_SpacyBackend']"
  - type: smart
  - type: crossref
 renderer:
--- a/haystack/components/extractors/named_entity_extractor.py
+++ b/haystack/components/extractors/named_entity_extractor.py
@ -43,13 +43,9 @@ class NamedEntityExtractorBackend(Enum, metaclass=_BackendEnumMeta):
    NLP backend to use for Named Entity Recognition.
    """

-    #: Hugging Face.
-    #:
    #: Uses an Hugging Face model and pipeline.
    HUGGING_FACE = "hugging_face"

-    #: spaCy.
-    #:
    #: Uses a spaCy model and pipeline.
    SPACY = "spacy"

@ -77,6 +73,33 @@ class NamedEntityAnnotation:

@component
 class NamedEntityExtractor:
+    """
+    Annotates named entities in a collection of documents.
+
+    The component supports two backends: Hugging Face and spaCy. The
+    former can be used with any sequence classification model from the
+    [Hugging Face model hub](https://huggingface.co/models), while the
+    latter can be used with any [spaCy model](https://spacy.io/models)
+    that contains an NER component. Annotations are stored as metadata
+    in the documents.
+
+    Usage example:
+    ```python
+    from haystack import Document
+    from haystack.components.extractors.named_entity_extractor import NamedEntityExtractor
+
+    documents = [
+        Document(content="I'm Merlin, the happy pig!"),
+        Document(content="My name is Clara and I live in Berkeley, California."),
+    ]
+    extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
+    extractor.warm_up()
+    results = extractor.run(documents=documents)["documents"]
+    annotations = [NamedEntityExtractor.get_stored_annotations(doc) for doc in results]
+    print(annotations)
+    ```
+    """
+
    _METADATA_KEY = "named_entities"

    def __init__(
@ -88,27 +111,22 @@ class NamedEntityExtractor:
        device: Optional[ComponentDevice] = None,
    ) -> None:
        """
-        Construct a Named Entity extractor component.
+        Create a Named Entity extractor component.

        :param backend:
            Backend to use for NER.
        :param model:
            Name of the model or a path to the model on
-            the local disk.
-
-            Dependent on the backend.
+            the local disk. Dependent on the backend.
        :param pipeline_kwargs:
            Keyword arguments passed to the pipeline. The
-            pipeline can override these arguments.
-
-            Dependent on the backend.
+            pipeline can override these arguments. Dependent on the backend.
        :param device:
            The device on which the model is loaded. If `None`,
-            the default device is automatically selected.
-
-            If a device/device map is specified in `pipeline_kwargs`,
-            it overrides this parameter (only applicable to the HuggingFace
-            backend).
+            the default device is automatically selected. If a
+            device/device map is specified in `pipeline_kwargs`,
+            it overrides this parameter (only applicable to the
+            HuggingFace backend).
        """

        if isinstance(backend, str):
@ -126,7 +144,10 @@ class NamedEntityExtractor:

    def warm_up(self):
        """
-        Initialize the named entity extractor backend.
+        Initialize the component.
+
+        :raises ComponentError:
+            If the backend fails to initialize successfully.
        """
        try:
            self._backend.initialize()
@ -138,14 +159,17 @@ class NamedEntityExtractor:
    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document], batch_size: int = 1) -> Dict[str, Any]:
        """
-         Run the named-entity extractor.
+        Annotate named entities in each document and store
+        the annotations in the document's metadata.

        :param documents:
-             Documents to process.
+            Documents to process.
        :param batch_size:
-             Batch size used for processing the documents.
+            Batch size used for processing the documents.
        :returns:
-             The processed documents.
+            Processed documents.
+        :raises ComponentError:
+            If the backend fails to process a document.
        """
        texts = [doc.content if doc.content is not None else "" for doc in documents]
        annotations = self._backend.annotate(texts, batch_size=batch_size)
@ -163,7 +187,10 @@ class NamedEntityExtractor:

    def to_dict(self) -> Dict[str, Any]:
        """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
@ -176,10 +203,12 @@ class NamedEntityExtractor:
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NamedEntityExtractor":
        """
-        Deserialize the component from a dictionary.
+        Deserializes the component from a dictionary.

        :param data:
-            The dictionary to deserialize from.
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
        """
        try:
            init_params = data["init_parameters"]
@ -266,8 +295,8 @@ class _NerBackend(ABC):
    @property
    def device(self) -> ComponentDevice:
        """
-        Returns the identifier of the device on which
-        the backend's model is loaded.
+        :returns:
+            The device on which the backend's model is loaded.
        """
        return self._device