From 3467a2786d2d1e4f39bc19394e7effc54532d6cd Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Tue, 21 Mar 2023 23:58:18 -0700 Subject: [PATCH] Update patterns.py (#391) --- unstructured/nlp/patterns.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index 2607f51ab..41fbb7b91 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -66,8 +66,9 @@ EMAIL_HEAD_PATTERN = ( ) EMAIL_HEAD_RE = re.compile(EMAIL_HEAD_PATTERN) -# Helps split text by paragraphs -PARAGRAPH_PATTERN = "\n\n\n|\n\n|\r\n|\r|\n" # noqa: W605 NOTE(harrell) +# Helps split text by paragraphs. There must be one newline, with potential whitespace +# (including \r and \n chars) on either side +PARAGRAPH_PATTERN = r"\s*\n\s*" # noqa: W605 NOTE(harrell) # IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01 IP_ADDRESS_PATTERN = (