From 0f91a9bfa8fded63ba43b4b4125fcf30a011d3d5 Mon Sep 17 00:00:00 2001 From: Yuming Long <63475068+yuming-long@users.noreply.github.com> Date: Wed, 10 May 2023 12:16:15 -0400 Subject: [PATCH] Chore: Add a trace logger for NLP output (#561) * add and config trace logger * chore: update loggers in partition * doc: changelog and version * doc: update changelog * doc: remove placeholder * chore: bypass mypy --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- unstructured/logger.py | 15 +++++++++++++++ unstructured/partition/text_type.py | 20 +++++++++++--------- 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce62c2e75..f7e5c03a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.6.6-dev0 + +### Enhancements + +* Added an additional trace logger for NLP debugging. + +### Features + +### Fixes + ## 0.6.5 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 36225a4ca..d7d6a9bd3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.5" # pragma: no cover +__version__ = "0.6.6-dev0" # pragma: no cover diff --git a/unstructured/logger.py b/unstructured/logger.py index 2a720b671..b1f7c11ed 100644 --- a/unstructured/logger.py +++ b/unstructured/logger.py @@ -1,3 +1,18 @@ import logging logger = logging.getLogger("unstructured") +trace_logger = logging.getLogger("unstructured.trace") + +# Create a custom logging level +DETAIL = 15 +logging.addLevelName(DETAIL, "DETAIL") + + +# Create a custom log method for the "DETAIL" level +def detail(self, message, *args, **kws): + if self.isEnabledFor(DETAIL): + self._log(DETAIL, message, args, **kws) + + +# Add the custom log method to the logging.Logger class +logging.Logger.detail = detail # type: ignore diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 436af6956..fa606f87b 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -10,7 +10,7 @@ else: from typing import Final from unstructured.cleaners.core import remove_punctuation -from unstructured.logger import logger +from unstructured.logger import trace_logger from unstructured.nlp.english_words import ENGLISH_WORDS from unstructured.nlp.patterns import ( ENDS_IN_PUNCT_RE, @@ -57,11 +57,11 @@ def is_possible_narrative_text( language_checks = _language_checks.lower() == "true" if len(text) == 0: - logger.debug("Not narrative. Text is empty.") + trace_logger.detail("Not narrative. Text is empty.") # type: ignore return False if text.isnumeric(): - logger.debug(f"Not narrative. Text is all numeric:\n\n{text}") + trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}") # type: ignore return False language = os.environ.get("UNSTRUCTURED_LANGUAGE", language) @@ -74,7 +74,7 @@ def is_possible_narrative_text( os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold), ) if exceeds_cap_ratio(text, threshold=cap_threshold): - logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") + trace_logger.detail(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") # type: ignore # noqa: E501 return False non_alpha_threshold = float( @@ -84,7 +84,7 @@ def is_possible_narrative_text( return False if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en": - logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}") + trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501 return False return True @@ -121,7 +121,7 @@ def is_possible_title( language_checks = _language_checks.lower() == "true" if len(text) == 0: - logger.debug("Not a title. Text is empty.") + trace_logger.detail("Not a title. Text is empty.") # type: ignore return False if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None: @@ -150,14 +150,16 @@ def is_possible_title( return False if text.isnumeric(): - logger.debug(f"Not a title. Text is all numeric:\n\n{text}") + trace_logger.detail(f"Not a title. Text is all numeric:\n\n{text}") # type: ignore return False # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS" # that sometimes get tokenized as separate sentences due to the period, but are still # valid titles if sentence_count(text, min_length=sentence_min_length) > 1: - logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}") + trace_logger.detail( # type: ignore + f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}", + ) return False return True @@ -223,7 +225,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int: sentence = remove_punctuation(sentence) words = [word for word in word_tokenize(sentence) if word != "."] if min_length and len(words) < min_length: - logger.debug( + trace_logger.detail( # type: ignore f"Skipping sentence because does not exceed {min_length} word tokens\n" f"{sentence}", )