Removed en_core_web_md from setup (#10839)

* Removed en_core_web_md from setup

* Use Constant

---------

Co-authored-by: ulixius9 <mayursingal9@gmail.com>
This commit is contained in:
NiharDoshi99 2023-03-30 15:13:41 +05:30 committed by GitHub
parent 2e2c6a0cdf
commit 3406c8c868
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 13 additions and 5 deletions

View File

@ -67,10 +67,6 @@ COMMONS = {
# required libraries for PII tagging
pii_requirements = {
"spacy==3.5.0",
(
"en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/"
"en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl"
),
VERSIONS["pandas"],
"presidio-analyzer==2.2.32",
}

View File

@ -25,6 +25,8 @@ from metadata.utils import fqn
PII = "PII"
SPACY_EN_MODEL = "en_core_web_md"
class PiiTypes(Enum):
"""PiiTypes enumerates the different types of PII data"""
@ -115,6 +117,7 @@ class NERScanner:
"""A scanner that uses Spacy NER for entity recognition"""
def __init__(self, metadata: OpenMetadata):
import spacy # pylint: disable=import-outside-toplevel
from presidio_analyzer import ( # pylint: disable=import-outside-toplevel
AnalyzerEngine,
)
@ -122,10 +125,19 @@ class NERScanner:
SpacyNlpEngine,
)
try:
spacy.load(SPACY_EN_MODEL)
except OSError:
logging.warning("Downloading en_core_web_md language model for the spaCy")
from spacy.cli import download # pylint: disable=import-outside-toplevel
download(SPACY_EN_MODEL)
spacy.load(SPACY_EN_MODEL)
self.metadata = metadata
self.text = ""
self.analyzer = AnalyzerEngine(
nlp_engine=SpacyNlpEngine(models={"en": "en_core_web_md"})
nlp_engine=SpacyNlpEngine(models={"en": SPACY_EN_MODEL})
)
def get_highest_score_label(