mirror of https://github.com/deepset-ai/haystack.git
synced 2025-06-26 22:00:13 +00:00
Create EntityExtractor (#1573)
* Create extractor/entity.py
* Aggregate NER words into entities
* Support indexing
* Add doc strings
* Add utility for printing
* Update signature of run() to match BaseComponent
* Add test
* Modify simplify_ner_for_qa to return the dictionary and add its test

Co-authored-by: brandenchan <brandenchan@icloud.com>
This commit is contained in:
parent 69a0c9f2ed
commit 25d76f508d
haystack/extractor/__init__.py (new file)
@@ -0,0 +1 @@
from haystack.extractor.entity import EntityExtractor, simplify_ner_for_qa
haystack/extractor/entity.py (new file)
@@ -0,0 +1,75 @@
from typing import List, Union, Dict, Optional, Tuple

import json
from haystack import BaseComponent, Document, MultiLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from transformers import pipeline


class EntityExtractor(BaseComponent):
    """
    This node is used to extract entities out of documents.
    The most common use case for this would be as a named entity extractor.
    The default model used is dslim/bert-base-NER.
    This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only,
    or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities.
    The entities extracted by this Node will populate Document.entities
    """
    outgoing_edges = 1

    def __init__(self,
                 model_name_or_path="dslim/bert-base-NER"):

        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        token_classifier = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
        self.model = pipeline("ner", model=token_classifier, tokenizer=tokenizer, aggregation_strategy="simple")

    def run(self, documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]:  # type: ignore
        """
        This is the method called when this node is used in a pipeline
        """
        if documents:
            for doc in documents:
                # In a querying pipeline, doc is a haystack.schema.Document object
                try:
                    doc.meta["entities"] = self.extract(doc.text)  # type: ignore
                # In an indexing pipeline, doc is a dictionary
                except AttributeError:
                    doc["meta"]["entities"] = self.extract(doc["text"])  # type: ignore
        output = {"documents": documents}
        return output, "output_1"

    def extract(self, text):
        """
        This function can be called to perform entity extraction when using the node in isolation.
        """
        entities = self.model(text)
        return entities


def simplify_ner_for_qa(output):
    """
    Returns a simplified version of the output dictionary
    with the following structure:
    [
        {
            answer: { ... }
            entities: [ { ... }, {} ]
        }
    ]
    The entities included are only the ones that overlap with
    the answer itself.
    """
    compact_output = []
    for answer in output["answers"]:

        entities = []
        for entity in answer["meta"]["entities"]:
            if entity["start"] >= answer["offset_start_in_doc"] and entity["end"] <= answer["offset_end_in_doc"]:
                entities.append(entity["word"])

        compact_output.append({
            "answer": answer["answer"],
            "entities": entities
        })
    return compact_output
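The extract() method above also makes the node usable outside a pipeline. A minimal standalone sketch (the sample sentence is illustrative; the output keys follow the transformers NER pipeline with aggregation_strategy="simple", which merges word pieces such as "Ber" + "##lin" back into whole entities):

# Minimal standalone usage sketch; the example sentence is illustrative.
from haystack.extractor import EntityExtractor

extractor = EntityExtractor()  # defaults to dslim/bert-base-NER

# With aggregation_strategy="simple", the underlying transformers pipeline
# groups word pieces into whole entities and returns one dict per entity.
for entity in extractor.extract("Carla lives in Berlin."):
    # Expected keys: "word", "entity_group", "score", "start", "end"
    print(entity["word"], entity["entity_group"])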
test/test_extractor.py (new file)
@@ -0,0 +1,57 @@
import pytest

from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader import FARMReader
from haystack.pipeline import Pipeline

from haystack.extractor import EntityExtractor, simplify_ner_for_qa


@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_extractor(document_store_with_docs):

    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    ner = EntityExtractor()
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    pipeline = Pipeline()
    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=ner, name="NER", inputs=["ESRetriever"])
    pipeline.add_node(component=reader, name="Reader", inputs=["NER"])

    prediction = pipeline.run(
        query="Who lives in Berlin?",
        params={
            "ESRetriever": {"top_k": 1},
            "Reader": {"top_k": 1},
        }
    )
    entities = [entity["word"] for entity in prediction["answers"][0]["meta"]["entities"]]
    assert "Carla" in entities
    assert "Berlin" in entities


@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_extractor_output_simplifier(document_store_with_docs):

    es_retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    ner = EntityExtractor()
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    pipeline = Pipeline()
    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=ner, name="NER", inputs=["ESRetriever"])
    pipeline.add_node(component=reader, name="Reader", inputs=["NER"])

    prediction = pipeline.run(
        query="Who lives in Berlin?",
        params={
            "ESRetriever": {"top_k": 1},
            "Reader": {"top_k": 1},
        }
    )
    simplified = simplify_ner_for_qa(prediction)
    assert simplified[0] == {
        "answer": "Carla",
        "entities": ["Carla"]
    }
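These tests exercise the querying pipeline only; the commit message also lists "Support indexing", which run() handles through its dict branch. A hypothetical sketch of that indexing path, assuming the pre-1.0 import layout used elsewhere in this diff and an Elasticsearch instance on localhost (the sample document and store settings are illustrative, not part of this commit):

# Hypothetical indexing-path sketch; the document store settings and the
# sample document are illustrative, not part of this commit.
from haystack.document_store import ElasticsearchDocumentStore
from haystack.extractor import EntityExtractor

document_store = ElasticsearchDocumentStore(host="localhost")
ner = EntityExtractor()

# At indexing time each doc is a plain dict, so run() takes the
# AttributeError branch and writes to doc["meta"]["entities"].
docs = [{"text": "Carla lives in Berlin.", "meta": {}}]
output, _ = ner.run(documents=docs)
document_store.write_documents(output["documents"])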