Mirror of https://github.com/Unstructured-IO/unstructured.git
Chore: Add a trace logger for NLP output (#561)
* add and config trace logger
* chore: update loggers in partition
* doc: changelog and version
* doc: update changelog
* doc: remove placeholder
* chore: bypass mypy
parent b52638f8e3
commit 0f91a9bfa8
CHANGELOG.md (10 lines changed)

@@ -1,3 +1,13 @@
+## 0.6.6-dev0
+
+### Enhancements
+
+* Added an additional trace logger for NLP debugging.
+
+### Features
+
+### Fixes
+
 ## 0.6.5

 ### Enhancements
unstructured/__version__.py

@@ -1 +1 @@
-__version__ = "0.6.5"  # pragma: no cover
+__version__ = "0.6.6-dev0"  # pragma: no cover
unstructured/logger.py

@@ -1,3 +1,18 @@
 import logging

 logger = logging.getLogger("unstructured")
+trace_logger = logging.getLogger("unstructured.trace")
+
+# Create a custom logging level
+DETAIL = 15
+logging.addLevelName(DETAIL, "DETAIL")
+
+
+# Create a custom log method for the "DETAIL" level
+def detail(self, message, *args, **kws):
+    if self.isEnabledFor(DETAIL):
+        self._log(DETAIL, message, args, **kws)
+
+
+# Add the custom log method to the logging.Logger class
+logging.Logger.detail = detail  # type: ignore
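For readers following along, here is a minimal sketch of how a caller could surface the new DETAIL-level output. The handler and format string below are illustrative choices, not something this commit configures:

    import logging

    from unstructured.logger import DETAIL, trace_logger

    # Attach a handler and lower the threshold so DETAIL (15) records are
    # emitted; by default nothing below WARNING is shown for this logger.
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s"))
    trace_logger.addHandler(handler)
    trace_logger.setLevel(DETAIL)

    # DETAIL sits between DEBUG (10) and INFO (20), so setting the level to
    # logging.DEBUG would also let these records through.
    trace_logger.detail("trace logging enabled")  # type: ignore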
unstructured/partition/text_type.py

@@ -10,7 +10,7 @@ else:
     from typing import Final

 from unstructured.cleaners.core import remove_punctuation
-from unstructured.logger import logger
+from unstructured.logger import trace_logger
 from unstructured.nlp.english_words import ENGLISH_WORDS
 from unstructured.nlp.patterns import (
     ENDS_IN_PUNCT_RE,

@@ -57,11 +57,11 @@ def is_possible_narrative_text(
         language_checks = _language_checks.lower() == "true"

     if len(text) == 0:
-        logger.debug("Not narrative. Text is empty.")
+        trace_logger.detail("Not narrative. Text is empty.")  # type: ignore
         return False

     if text.isnumeric():
-        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text is all numeric:\n\n{text}")  # type: ignore
         return False

     language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)

@@ -74,7 +74,7 @@ def is_possible_narrative_text(
         os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold),
     )
     if exceeds_cap_ratio(text, threshold=cap_threshold):
-        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")  # type: ignore # noqa: E501
         return False

     non_alpha_threshold = float(

@@ -84,7 +84,7 @@ def is_possible_narrative_text(
         return False

     if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
-        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
+        trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}")  # type: ignore # noqa: E501
         return False

     return True

@@ -121,7 +121,7 @@ def is_possible_title(
         language_checks = _language_checks.lower() == "true"

     if len(text) == 0:
-        logger.debug("Not a title. Text is empty.")
+        trace_logger.detail("Not a title. Text is empty.")  # type: ignore
         return False

     if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None:

@@ -150,14 +150,16 @@ def is_possible_title(
         return False

     if text.isnumeric():
-        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
+        trace_logger.detail(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
         return False

     # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
     # that sometimes get tokenized as separate sentences due to the period, but are still
     # valid titles
     if sentence_count(text, min_length=sentence_min_length) > 1:
-        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
+        trace_logger.detail(  # type: ignore
+            f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}",
+        )
         return False

     return True

@@ -223,7 +225,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
         sentence = remove_punctuation(sentence)
         words = [word for word in word_tokenize(sentence) if word != "."]
         if min_length and len(words) < min_length:
-            logger.debug(
+            trace_logger.detail(  # type: ignore
                 f"Skipping sentence because does not exceed {min_length} word tokens\n"
                 f"{sentence}",
            )
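And a short, hedged usage sketch of the updated checks, assuming the trace logger has been configured as in the earlier example (the input strings are arbitrary):

    from unstructured.partition.text_type import is_possible_narrative_text, is_possible_title

    # Purely numeric text is rejected, and with the trace logger at DETAIL the
    # reason ("Not narrative. Text is all numeric: ...") goes to
    # "unstructured.trace" instead of the default debug logger.
    assert is_possible_narrative_text("7") is False

    # Heading-like text is evaluated exactly as before; only the logging
    # destination changed in this commit.
    print(is_possible_title("ITEM 1A. RISK FACTORS"))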