mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	fix: use raw strings for regex patterns (#3029)
**Summary** Avoid `SyntaxWarning` and/or `SyntaxError` messages when importing `unstructured.nlp.patterns` by using raw strings (`"r"` prefix) for regex patterns which may contain `\x` character sequences not recognized by the Python parser for normal strings. Fixes: #2495
This commit is contained in:
		
							parent
							
								
									e6ada05c55
								
							
						
					
					
						commit
						0de9215db4
					
				@ -1,4 +1,4 @@
 | 
				
			|||||||
## 0.13.8-dev13
 | 
					## 0.13.8-dev14
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Enhancements
 | 
					### Enhancements
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -19,6 +19,7 @@
 | 
				
			|||||||
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
 | 
					* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
 | 
				
			||||||
* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
 | 
					* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
 | 
				
			||||||
* **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
 | 
					* **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
 | 
				
			||||||
 | 
					* **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## 0.13.7
 | 
					## 0.13.7
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -27,6 +27,7 @@ lint.select = [
 | 
				
			|||||||
    "UP018",    # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) --
 | 
					    "UP018",    # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) --
 | 
				
			||||||
    "UP032",    # -- Use f-string instead of `.format()` call --
 | 
					    "UP032",    # -- Use f-string instead of `.format()` call --
 | 
				
			||||||
    "UP034",    # -- Avoid extraneous parentheses --
 | 
					    "UP034",    # -- Avoid extraneous parentheses --
 | 
				
			||||||
 | 
					    "W",        # -- Warnings, including invalid escape-sequence --
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
lint.ignore = [
 | 
					lint.ignore = [
 | 
				
			||||||
    "COM812",   # -- over aggressively insists on trailing commas where not desireable --
 | 
					    "COM812",   # -- over aggressively insists on trailing commas where not desireable --
 | 
				
			||||||
 | 
				
			|||||||
@ -1 +1 @@
 | 
				
			|||||||
__version__ = "0.13.8-dev13"  # pragma: no cover
 | 
					__version__ = "0.13.8-dev14"  # pragma: no cover
 | 
				
			||||||
 | 
				
			|||||||
@ -53,7 +53,7 @@ UNICODE_BULLETS: Final[List[str]] = [
 | 
				
			|||||||
    "\u29BF",
 | 
					    "\u29BF",
 | 
				
			||||||
    "\u002D",
 | 
					    "\u002D",
 | 
				
			||||||
    "",
 | 
					    "",
 | 
				
			||||||
    "\*",  # noqa: W605 NOTE(robinson) - skipping qa because we need the escape for the regex
 | 
					    r"\*",
 | 
				
			||||||
    "\x95",
 | 
					    "\x95",
 | 
				
			||||||
    "·",
 | 
					    "·",
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
@ -76,7 +76,7 @@ EMAIL_HEAD_RE = re.compile(EMAIL_HEAD_PATTERN)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# Helps split text by paragraphs. There must be one newline, with potential whitespace
 | 
					# Helps split text by paragraphs. There must be one newline, with potential whitespace
 | 
				
			||||||
# (including \r and \n chars) on either side
 | 
					# (including \r and \n chars) on either side
 | 
				
			||||||
PARAGRAPH_PATTERN = r"\s*\n\s*"  # noqa: W605 NOTE(harrell)
 | 
					PARAGRAPH_PATTERN = r"\s*\n\s*"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
PARAGRAPH_PATTERN_RE = re.compile(
 | 
					PARAGRAPH_PATTERN_RE = re.compile(
 | 
				
			||||||
    f"((?:{BULLETS_PATTERN})|{PARAGRAPH_PATTERN})(?!{BULLETS_PATTERN}|$)",
 | 
					    f"((?:{BULLETS_PATTERN})|{PARAGRAPH_PATTERN})(?!{BULLETS_PATTERN}|$)",
 | 
				
			||||||
@ -94,28 +94,23 @@ ONE_LINE_BREAK_PARAGRAPH_PATTERN_RE = re.compile(ONE_LINE_BREAK_PARAGRAPH_PATTER
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
 | 
					# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
 | 
				
			||||||
IP_ADDRESS_PATTERN = (
 | 
					IP_ADDRESS_PATTERN = (
 | 
				
			||||||
    "[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",  # noqa: W605 NOTE(harrell)
 | 
					    r"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",
 | 
				
			||||||
    # - skipping qa because we need the escape for the regex
 | 
					 | 
				
			||||||
    "[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
 | 
					    "[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")
 | 
					IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
IP_ADDRESS_NAME_PATTERN = "[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"  # noqa: W605 NOTE(harrell)
 | 
					IP_ADDRESS_NAME_PATTERN = r"[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"
 | 
				
			||||||
# - skipping qa because we need the escape for the regex
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Mapi ID example: 32.88.5467.123
 | 
					# Mapi ID example: 32.88.5467.123
 | 
				
			||||||
MAPI_ID_PATTERN = "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"  # noqa: W605 NOTE(harrell)
 | 
					MAPI_ID_PATTERN = r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"
 | 
				
			||||||
# - skipping qa because we need the escape for the regex
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
 | 
					# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
 | 
				
			||||||
# NOTE(harrell) - skipping qa because we need the escape for the regex
 | 
					 | 
				
			||||||
EMAIL_DATETIMETZ_PATTERN = (
 | 
					EMAIL_DATETIMETZ_PATTERN = (
 | 
				
			||||||
    r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"  # noqa: W605,E501
 | 
					    r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
 | 
					EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"  # noqa: W605 NOTE(harrell)
 | 
					EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
 | 
				
			||||||
# - skipping qa because we need the escape for the regex
 | 
					 | 
				
			||||||
EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)
 | 
					EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
 | 
					ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
 | 
				
			||||||
 | 
				
			|||||||
@ -91,7 +91,7 @@ def _parse_received_data(data: str) -> list[Element]:
 | 
				
			|||||||
def _parse_email_address(data: str) -> tuple[str, str]:
 | 
					def _parse_email_address(data: str) -> tuple[str, str]:
 | 
				
			||||||
    email_address = extract_email_address(data)
 | 
					    email_address = extract_email_address(data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    PATTERN = "<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>"  # noqa: W605 Note(harrell)
 | 
					    PATTERN = r"<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>"
 | 
				
			||||||
    name = re.split(PATTERN, data.lower())[0].title().strip()
 | 
					    name = re.split(PATTERN, data.lower())[0].title().strip()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return name, email_address[0]
 | 
					    return name, email_address[0]
 | 
				
			||||||
@ -224,7 +224,7 @@ def extract_attachment_info(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def has_embedded_image(element):
 | 
					def has_embedded_image(element):
 | 
				
			||||||
    PATTERN = re.compile("\[image: .+\]")  # noqa: W605 NOTE(harrell)
 | 
					    PATTERN = re.compile(r"\[image: .+\]")
 | 
				
			||||||
    return PATTERN.search(element.text)
 | 
					    return PATTERN.search(element.text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user