Chore: Add a trace logger for NLP output (#561)

* add and config trace logger * chore: update loggers in partition * doc: changelog and version * doc: update changelog * doc: remove placeholder * chore: bypass mypy
2025-10-14 17:43:20 +00:00 · 2023-05-10 12:16:15 -04:00 · 2023-05-10 12:16:15 -04:00 · 0f91a9bfa8
commit 0f91a9bfa8
parent b52638f8e3
4 changed files with 37 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
 ## 0.6.6-dev0
 ### Enhancements
 * Added an additional trace logger for NLP debugging.
 ### Features
 ### Fixes
 ## 0.6.5
 ### Enhancements
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.5"  # pragma: no cover
+__version__ = "0.6.6-dev0"  # pragma: no cover
--- a/unstructured/logger.py
+++ b/unstructured/logger.py
@ -1,3 +1,18 @@
 import logging
 import sys
 # Main package logger.
 logger = logging.getLogger("unstructured")
 # Separate logger (a child of "unstructured" by dotted name) used for verbose
 # tracing output, so it can be given its own level/handlers.
 trace_logger = logging.getLogger("unstructured.trace")
 # Create a custom logging level; 15 sits between DEBUG (10) and INFO (20).
 DETAIL = 15
 logging.addLevelName(DETAIL, "DETAIL")
 # Python 3.11 changed how Logger.findCaller() counts frames: it now stops at the
 # first frame outside the logging package, which for a wrapper defined in this
 # module is the wrapper itself. On those versions the stacklevel must be bumped
 # by one so records point at the real call site. Older versions already resolve
 # to the wrapper's caller, so bumping there would skip one frame too many.
 _DETAIL_NEEDS_STACKLEVEL_BUMP = sys.version_info >= (3, 11)
 # Create a custom log method for the "DETAIL" level
 def detail(self, message, *args, **kws):
     """Log ``message % args`` with severity ``DETAIL`` on this logger.
 
     Mirrors the built-in ``Logger.debug``/``Logger.info`` methods: the record
     is only created when the logger is enabled for ``DETAIL``, and keyword
     arguments (``exc_info``, ``extra``, ``stack_info``, ``stacklevel``) are
     forwarded to ``Logger._log``.
 
     The caller information stored on the record (``funcName``, ``lineno``,
     ``pathname``) refers to the code that called ``detail``, not to this
     wrapper.
     """
     if self.isEnabledFor(DETAIL):
         if _DETAIL_NEEDS_STACKLEVEL_BUMP:
             # Skip this wrapper frame; honour any stacklevel the caller passed.
             kws["stacklevel"] = kws.get("stacklevel", 1) + 1
         self._log(DETAIL, message, args, **kws)
 # Add the custom log method to the logging.Logger class
 logging.Logger.detail = detail  # type: ignore
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@ -10,7 +10,7 @@ else:
    from typing import Final
 from unstructured.cleaners.core import remove_punctuation
-from unstructured.logger import logger
+from unstructured.logger import trace_logger
 from unstructured.nlp.english_words import ENGLISH_WORDS
 from unstructured.nlp.patterns import (
    ENDS_IN_PUNCT_RE,
@ -57,11 +57,11 @@ def is_possible_narrative_text(
        language_checks = _language_checks.lower() == "true"
    if len(text) == 0:
-        logger.debug("Not narrative. Text is empty.")
+        trace_logger.detail("Not narrative. Text is empty.")  # type: ignore
        return False
    if text.isnumeric():
-        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}")  # type: ignore
        return False
    language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
@ -74,7 +74,7 @@ def is_possible_narrative_text(
        os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold),
    )
    if exceeds_cap_ratio(text, threshold=cap_threshold):
-        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")  # type: ignore # noqa: E501
        return False
    non_alpha_threshold = float(
@ -84,7 +84,7 @@ def is_possible_narrative_text(
        return False
    if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
-        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}")  # type: ignore # noqa: E501
        return False
    return True
@ -121,7 +121,7 @@ def is_possible_title(
        language_checks = _language_checks.lower() == "true"
    if len(text) == 0:
-        logger.debug("Not a title. Text is empty.")
+        trace_logger.detail("Not a title. Text is empty.")  # type: ignore
        return False
    if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None:
@ -150,14 +150,16 @@ def is_possible_title(
        return False
    if text.isnumeric():
-        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
+        trace_logger.detail(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
        return False
    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
    # that sometimes get tokenized as separate sentences due to the period, but are still
    # valid titles
    if sentence_count(text, min_length=sentence_min_length) > 1:
-        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
+        trace_logger.detail(  # type: ignore
            f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}",
        )
        return False
    return True
@ -223,7 +225,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
        sentence = remove_punctuation(sentence)
        words = [word for word in word_tokenize(sentence) if word != "."]
        if min_length and len(words) < min_length:
-            logger.debug(
+            trace_logger.detail(  # type: ignore
                f"Skipping sentence because does not exceed {min_length} word tokens\n"
                f"{sentence}",
            )
`@ -1 +1 @@`
	`__version__ = "0.6.5" # pragma: no cover`	`__version__ = "0.6.6-dev0" # pragma: no cover`