From 1e2da6df46d170ae9d3c8ad1f3419932ddf191f0 Mon Sep 17 00:00:00 2001 From: Magnus F Date: Mon, 9 Dec 2024 23:19:13 +0100 Subject: [PATCH] fix: ipv4 address regex (#3808) I noticed the ipv4 regex is wrong (it only captures one- or two-digit octets, e.g. `n.nn.n.nn`). Here's a correction and a bumped test for it. If you wish I can break out the ipv4 test to its own case, so we don't interfere with the existing `EMAIL_META_DATA_INPUT` ipv6 extraction test. Side note: The comment at `unstructured/nlp/patterns.py#95` includes a bad ipv4 address example (last octet is wrongly left-padded with a zero). I left it as it is because I'm not sure if the intention is to include "non-conventional" ipv4 addresses, like octal or hexadecimal octets. --- CHANGELOG.md | 4 ++++ test_unstructured/cleaners/test_extract.py | 4 ++-- unstructured/nlp/patterns.py | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 323f6484e..955a0d4ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## 0.16.11-dev1 +### Fixes + +- Fix ipv4 regex to correctly include up to three digit octets. 
+ ### Enhancements - **Enhance quote standardization tests** with additional Unicode scenarios diff --git a/test_unstructured/cleaners/test_extract.py b/test_unstructured/cleaners/test_extract.py index 6ca059883..86ac0e848 100644 --- a/test_unstructured/cleaners/test_extract.py +++ b/test_unstructured/cleaners/test_extract.py @@ -5,7 +5,7 @@ import pytest from unstructured.cleaners import extract EMAIL_META_DATA_INPUT = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by - \n ABC.DEF.local ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\ + \n ABC.DEF.local ([68.183.71.12]) with mapi id\ n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200""" @@ -37,7 +37,7 @@ def test_extract_email_address(): def test_extract_ip_address(): assert extract.extract_ip_address(EMAIL_META_DATA_INPUT) == [ "ba23::58b5:2236:45g2:88h2", - "ba23::58b5:2236:45g2:88h2%25", + "68.183.71.12", ] diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index e18f067c4..b3a77f77a 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -92,9 +92,9 @@ LINE_BREAK_RE = re.compile(LINE_BREAK) ONE_LINE_BREAK_PARAGRAPH_PATTERN = r"^(?:(?!\.\s*$).)*$" ONE_LINE_BREAK_PARAGRAPH_PATTERN_RE = re.compile(ONE_LINE_BREAK_PARAGRAPH_PATTERN) -# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01 +# IP Address examples: ba23::58b5:2236:45g2:88h2, 10.0.2.01 or 68.183.71.12 IP_ADDRESS_PATTERN = ( - r"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}", + r"(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)){3}", "[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*", ) IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")