Chore: Add a trace logger for NLP output (#561)

* add and configure trace logger

* chore: update loggers in partition

* doc: changelog and version

* doc: update changelog

* doc: remove placeholder

* chore: bypass mypy
This commit is contained in:
Yuming Long 2023-05-10 12:16:15 -04:00 committed by GitHub
parent b52638f8e3
commit 0f91a9bfa8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 37 additions and 10 deletions

View File

@ -1,3 +1,13 @@
## 0.6.6-dev0
### Enhancements
* Added a trace logger (`unstructured.trace`) with a custom `DETAIL` log level for NLP debugging.
### Features
### Fixes
## 0.6.5 ## 0.6.5
### Enhancements ### Enhancements

View File

@ -1 +1 @@
__version__ = "0.6.5" # pragma: no cover __version__ = "0.6.6-dev0" # pragma: no cover

View File

@ -1,3 +1,18 @@
import logging import logging
logger = logging.getLogger("unstructured") logger = logging.getLogger("unstructured")
trace_logger = logging.getLogger("unstructured.trace")
# Create a custom logging level. 15 places it between the standard
# logging.DEBUG (10) and logging.INFO (20) levels.
DETAIL = 15
logging.addLevelName(DETAIL, "DETAIL")


# Create a custom log method for the "DETAIL" level
def detail(self, message, *args, **kws):
    """Log ``message`` with level DETAIL on this logger.

    Mirrors the built-in convenience methods such as ``Logger.debug``:
    ``args`` are the %-style arguments merged into ``message`` and ``kws``
    are forwarded to ``Logger._log`` (e.g. ``exc_info``, ``extra``).
    Nothing is emitted unless the logger is enabled for DETAIL.
    """
    if not self.isEnabledFor(DETAIL):
        return

    # Function-scope import keeps this block self-contained; it is only
    # reached when a DETAIL record is actually going to be emitted.
    import sys

    # Python 3.11 changed how the logging module locates the caller: frames
    # outside the logging package are no longer skipped implicitly, so this
    # helper itself would be reported as funcName/lineno/filename. Asking for
    # one extra stack level restores the real call site. Older interpreters
    # already resolve the caller correctly (and 3.7's ``_log`` does not accept
    # ``stacklevel`` at all), so they are left untouched.
    if sys.version_info >= (3, 11):
        kws["stacklevel"] = kws.get("stacklevel", 1) + 1

    self._log(DETAIL, message, args, **kws)


# Add the custom log method to the logging.Logger class
logging.Logger.detail = detail  # type: ignore

View File

@ -10,7 +10,7 @@ else:
from typing import Final from typing import Final
from unstructured.cleaners.core import remove_punctuation from unstructured.cleaners.core import remove_punctuation
from unstructured.logger import logger from unstructured.logger import trace_logger
from unstructured.nlp.english_words import ENGLISH_WORDS from unstructured.nlp.english_words import ENGLISH_WORDS
from unstructured.nlp.patterns import ( from unstructured.nlp.patterns import (
ENDS_IN_PUNCT_RE, ENDS_IN_PUNCT_RE,
@ -57,11 +57,11 @@ def is_possible_narrative_text(
language_checks = _language_checks.lower() == "true" language_checks = _language_checks.lower() == "true"
if len(text) == 0: if len(text) == 0:
logger.debug("Not narrative. Text is empty.") trace_logger.detail("Not narrative. Text is empty.") # type: ignore
return False return False
if text.isnumeric(): if text.isnumeric():
logger.debug(f"Not narrative. Text is all numeric:\n\n{text}") trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}") # type: ignore
return False return False
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language) language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
@ -74,7 +74,7 @@ def is_possible_narrative_text(
os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold), os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold),
) )
if exceeds_cap_ratio(text, threshold=cap_threshold): if exceeds_cap_ratio(text, threshold=cap_threshold):
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") trace_logger.detail(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") # type: ignore # noqa: E501
return False return False
non_alpha_threshold = float( non_alpha_threshold = float(
@ -84,7 +84,7 @@ def is_possible_narrative_text(
return False return False
if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en": if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}") trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501
return False return False
return True return True
@ -121,7 +121,7 @@ def is_possible_title(
language_checks = _language_checks.lower() == "true" language_checks = _language_checks.lower() == "true"
if len(text) == 0: if len(text) == 0:
logger.debug("Not a title. Text is empty.") trace_logger.detail("Not a title. Text is empty.") # type: ignore
return False return False
if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None: if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None:
@ -150,14 +150,16 @@ def is_possible_title(
return False return False
if text.isnumeric(): if text.isnumeric():
logger.debug(f"Not a title. Text is all numeric:\n\n{text}") trace_logger.detail(f"Not a title. Text is all numeric:\n\n{text}") # type: ignore
return False return False
# NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS" # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
# that sometimes get tokenized as separate sentences due to the period, but are still # that sometimes get tokenized as separate sentences due to the period, but are still
# valid titles # valid titles
if sentence_count(text, min_length=sentence_min_length) > 1: if sentence_count(text, min_length=sentence_min_length) > 1:
logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}") trace_logger.detail( # type: ignore
f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}",
)
return False return False
return True return True
@ -223,7 +225,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
sentence = remove_punctuation(sentence) sentence = remove_punctuation(sentence)
words = [word for word in word_tokenize(sentence) if word != "."] words = [word for word in word_tokenize(sentence) if word != "."]
if min_length and len(words) < min_length: if min_length and len(words) < min_length:
logger.debug( trace_logger.detail( # type: ignore
f"Skipping sentence because does not exceed {min_length} word tokens\n" f"Skipping sentence because does not exceed {min_length} word tokens\n"
f"{sentence}", f"{sentence}",
) )