From 0f91a9bfa8fded63ba43b4b4125fcf30a011d3d5 Mon Sep 17 00:00:00 2001
From: Yuming Long <63475068+yuming-long@users.noreply.github.com>
Date: Wed, 10 May 2023 12:16:15 -0400
Subject: [PATCH] Chore: Add a trace logger for NLP output (#561)

* add and configure trace logger

* chore: update loggers in partition

* doc: changelog and version

* doc: update changelog

* doc: remove placeholder

* chore: bypass mypy
---
 CHANGELOG.md                        | 10 ++++++++++
 unstructured/__version__.py         |  2 +-
 unstructured/logger.py              | 15 +++++++++++++++
 unstructured/partition/text_type.py | 20 +++++++++++---------
 4 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce62c2e75..f7e5c03a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.6.6-dev0
+
+### Enhancements
+
+* Added an additional trace logger for NLP debugging.
+
+### Features
+
+### Fixes
+
 ## 0.6.5
 
 ### Enhancements
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 36225a4ca..d7d6a9bd3 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.5"  # pragma: no cover
+__version__ = "0.6.6-dev0"  # pragma: no cover
diff --git a/unstructured/logger.py b/unstructured/logger.py
index 2a720b671..b1f7c11ed 100644
--- a/unstructured/logger.py
+++ b/unstructured/logger.py
@@ -1,3 +1,18 @@
 import logging
 
 logger = logging.getLogger("unstructured")
+trace_logger = logging.getLogger("unstructured.trace")  # child of the "unstructured" logger
+
+# Create a custom logging level; 15 sits between the stdlib DEBUG (10) and INFO (20)
+DETAIL = 15
+logging.addLevelName(DETAIL, "DETAIL")
+
+
+# Create a custom log method for the "DETAIL" level
+def detail(self, message, *args, **kws):
+    if self.isEnabledFor(DETAIL):  # skip the call entirely when DETAIL is filtered out
+        self._log(DETAIL, message, args, **kws)  # args passed as one tuple, as _log expects
+
+
+# Add the custom log method to the logging.Logger class so loggers expose .detail()
+logging.Logger.detail = detail  # type: ignore
diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py
index 436af6956..fa606f87b 100644
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@@ -10,7 +10,7 @@ else:
     from typing import Final
 
 from unstructured.cleaners.core import remove_punctuation
-from unstructured.logger import logger
+from unstructured.logger import trace_logger
 from unstructured.nlp.english_words import ENGLISH_WORDS
 from unstructured.nlp.patterns import (
     ENDS_IN_PUNCT_RE,
@@ -57,11 +57,11 @@ def is_possible_narrative_text(
         language_checks = _language_checks.lower() == "true"
 
     if len(text) == 0:
-        logger.debug("Not narrative. Text is empty.")
+        trace_logger.detail("Not narrative. Text is empty.")  # type: ignore
         return False
 
     if text.isnumeric():
-        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}")  # type: ignore
         return False
 
     language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
@@ -74,7 +74,7 @@ def is_possible_narrative_text(
         os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold),
     )
     if exceeds_cap_ratio(text, threshold=cap_threshold):
-        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")  # type: ignore # noqa: E501
         return False
 
     non_alpha_threshold = float(
@@ -84,7 +84,7 @@ def is_possible_narrative_text(
         return False
 
     if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
-        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}")  # type: ignore # noqa: E501
         return False
 
     return True
@@ -121,7 +121,7 @@ def is_possible_title(
         language_checks = _language_checks.lower() == "true"
 
     if len(text) == 0:
-        logger.debug("Not a title. Text is empty.")
+        trace_logger.detail("Not a title. Text is empty.")  # type: ignore
         return False
 
     if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None:
@@ -150,14 +150,16 @@ def is_possible_title(
         return False
 
     if text.isnumeric():
-        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
+        trace_logger.detail(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
         return False
 
     # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
     # that sometimes get tokenized as separate sentences due to the period, but are still
     # valid titles
     if sentence_count(text, min_length=sentence_min_length) > 1:
-        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
+        trace_logger.detail(  # type: ignore
+            f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}",
+        )
         return False
 
     return True
@@ -223,7 +225,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
         sentence = remove_punctuation(sentence)
         words = [word for word in word_tokenize(sentence) if word != "."]
         if min_length and len(words) < min_length:
-            logger.debug(
+            trace_logger.detail(  # type: ignore
                 f"Skipping sentence because does not exceed {min_length} word tokens\n"
                 f"{sentence}",
             )