feat: adding RegexTextExtractor component from experimental (#9879)

* initial import of component

* adding release notes

* adding docs to docusaurus
This commit is contained in:
David S. Batista 2025-10-15 13:55:22 +02:00 committed by GitHub
parent fe60c765d9
commit cfa5d27614
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 331 additions and 6 deletions

View File

@ -1,7 +1,12 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/extractors]
modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
modules: [
"named_entity_extractor",
"llm_metadata_extractor",
"image/llm_document_content_extractor",
"regex_text_extractor",
]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
@ -15,7 +20,7 @@ processors:
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
excerpt: Extracts predefined entities out of a piece of text.
excerpt: Components to extract specific elements from textual data.
category_slug: haystack-api
title: Extractors
slug: extractors-api

View File

@ -1,7 +1,12 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/extractors]
modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
modules: [
"named_entity_extractor",
"llm_metadata_extractor",
"image/llm_document_content_extractor",
"regex_text_extractor",
]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
@ -14,10 +19,12 @@ processors:
- type: smart
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.DocusaurusRenderer
description: Extracts predefined entities out of a piece of text.
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
excerpt: Components to extract specific elements from textual data.
category_slug: haystack-api
title: Extractors
id: extractors-api
slug: extractors-api
order: 65
markdown:
descriptive_class_title: false
classdef_code_block: false

View File

@ -9,6 +9,7 @@ from lazy_imports import LazyImporter
_import_structure = {
"llm_metadata_extractor": ["LLMMetadataExtractor"],
"regex_text_extractor": ["RegexTextExtractor"],
"named_entity_extractor": ["NamedEntityAnnotation", "NamedEntityExtractor", "NamedEntityExtractorBackend"],
}
@ -17,6 +18,7 @@ if TYPE_CHECKING:
from .named_entity_extractor import NamedEntityAnnotation as NamedEntityAnnotation
from .named_entity_extractor import NamedEntityExtractor as NamedEntityExtractor
from .named_entity_extractor import NamedEntityExtractorBackend as NamedEntityExtractorBackend
# "import X as X" marks an explicit re-export for type checkers, matching the sibling imports.
from .regex_text_extractor import RegexTextExtractor as RegexTextExtractor
else:
sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)

View File

@ -0,0 +1,117 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import re
from typing import Union
from haystack import component, logging
from haystack.dataclasses import ChatMessage
logger = logging.getLogger(__name__)
@component
class RegexTextExtractor:
    """
    Extracts text from chat message or string input using a regex pattern.

    RegexTextExtractor searches the input with the provided regular expression and returns the text
    captured by the first capture group (or the entire match when the pattern has no capture groups).
    When a list of ChatMessages is given, only the last message in the list is searched.

    ### Usage example

    ```python
    from haystack.components.extractors import RegexTextExtractor
    from haystack.dataclasses import ChatMessage

    # Using with a string
    parser = RegexTextExtractor(regex_pattern='<issue url="(.+)">')
    result = parser.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
    # result: {"captured_text": "github.com/hahahaha"}

    # Using with ChatMessages
    messages = [ChatMessage.from_user('<issue url="github.com/hahahaha">hahahah</issue>')]
    result = parser.run(text_or_messages=messages)
    # result: {"captured_text": "github.com/hahahaha"}
    ```
    """

    def __init__(self, regex_pattern: str):
        """
        Creates an instance of the RegexTextExtractor component.

        :param regex_pattern:
            The regular expression pattern used to extract text.
            The pattern should include a capture group to extract the desired text.
            Example: '<issue url="(.+)">' captures 'github.com/hahahaha' from '<issue url="github.com/hahahaha">'.
        """
        self.regex_pattern = regex_pattern

        # Compiling here also surfaces an invalid pattern (re.error) at construction time
        # instead of on the first call to run().
        num_groups = re.compile(regex_pattern).groups
        if num_groups < 1:
            logger.warning(
                "The provided regex pattern {regex_pattern} doesn't contain any capture groups. "
                "The entire match will be returned instead.",
                regex_pattern=regex_pattern,
            )

    # NOTE: `captured_texts` is declared as an output but is never populated by this implementation;
    # the declaration is kept so the component's output sockets stay unchanged.
    @component.output_types(captured_text=str, captured_texts=list[str])
    def run(self, text_or_messages: Union[str, list[ChatMessage]]) -> dict:
        """
        Extracts text from input using the configured regex pattern.

        :param text_or_messages:
            Either a string or a list of ChatMessage objects. For a list, only the last message is searched.

        :returns:
            - If a match is found: `{"captured_text": "matched text"}`
            - If there is no match, the captured text is empty, the list is empty, or the last message
              has no text: `{}`

        :raises ValueError:
            If a list is received and its last element is not a ChatMessage instance.
        """
        if isinstance(text_or_messages, str):
            return RegexTextExtractor._build_result(self._extract_from_text(text_or_messages))

        if not text_or_messages:
            logger.warning("Received empty list of messages")
            return {}

        return self._process_last_message(text_or_messages)

    @staticmethod
    def _build_result(result: Union[str, list[str]]) -> dict:
        """
        Builds the output dictionary for `run`.

        :param result:
            The extracted text (or list of texts).
        :returns:
            `{}` when `result` is empty (empty string, empty list or None), otherwise
            `{"captured_text": result}`.
        """
        if not result:
            return {}
        return {"captured_text": result}

    def _process_last_message(self, messages: list[ChatMessage]) -> dict:
        """
        Extracts text from the last message of a non-empty list and builds the result.

        :param messages:
            Non-empty list of ChatMessage objects; earlier messages are ignored.
        :returns:
            The output dictionary for `run`.
        :raises ValueError:
            If the last element is not a ChatMessage instance.
        """
        last_message = messages[-1]
        if not isinstance(last_message, ChatMessage):
            raise ValueError(f"Expected ChatMessage object, got {type(last_message)}")

        if last_message.text is None:
            logger.warning("Last message has no text content")
            return {}

        return RegexTextExtractor._build_result(self._extract_from_text(last_message.text))

    def _extract_from_text(self, text: str) -> str:
        """
        Extract text using the regex pattern.

        :param text:
            The text to search through.
        :returns:
            The text captured by the first capturing group of the first match.
            If the pattern has no capture groups, returns the entire match.
            If no match is found, or the first group did not participate in the match,
            returns an empty string.
        """
        match = re.search(self.regex_pattern, text)
        if match is None:
            return ""

        if match.groups():
            # group(1) is None when the first group is optional and did not take part in the
            # match (e.g. r"(a)?b" on "b"); normalise to "" so a None never leaks into the output.
            first_group = match.group(1)
            return "" if first_group is None else first_group

        return match.group(0)

View File

@ -0,0 +1,4 @@
---
features:
- |
Added a new component, `RegexTextExtractor`, which extracts text from chat messages or string input based on a custom regex pattern.

View File

@ -0,0 +1,190 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from haystack import Pipeline
from haystack.components.extractors.regex_text_extractor import RegexTextExtractor
from haystack.dataclasses import ChatMessage
class TestRegexTextExtractor:
    """Unit and pipeline tests for the ``RegexTextExtractor`` component."""

    def test_init_with_capture_group(self):
        """The pattern is stored as given when it contains a capture group."""
        regex = r'<issue url="(.+?)">'
        sut = RegexTextExtractor(regex_pattern=regex)
        assert sut.regex_pattern == regex

    def test_init_without_capture_group(self):
        """The pattern is stored as given even without a capture group."""
        regex = r"<issue>"
        sut = RegexTextExtractor(regex_pattern=regex)
        assert sut.regex_pattern == regex

    def test_extract_from_string_with_capture_group(self):
        """A string input yields the content of the capture group."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        output = sut.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
        assert output == {"captured_text": "github.com/hahahaha"}

    def test_extract_from_string_without_capture_group(self):
        """Without a capture group the whole match is returned."""
        sut = RegexTextExtractor(regex_pattern=r"<issue>")
        output = sut.run(text_or_messages="This is an <issue> tag in the text")
        assert output == {"captured_text": "<issue>"}

    def test_extract_from_string_no_match(self):
        """A string that does not match produces an empty result."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        output = sut.run(text_or_messages="This text has no matching pattern")
        assert output == {}

    def test_extract_from_string_empty_input(self):
        """An empty string produces an empty result."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        output = sut.run(text_or_messages="")
        assert output == {}

    def test_extract_from_chat_messages_single_message(self):
        """A single chat message is searched like a plain string."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        chat = [ChatMessage.from_user('<issue url="github.com/test">test issue</issue>')]
        output = sut.run(text_or_messages=chat)
        assert output == {"captured_text": "github.com/test"}

    def test_extract_from_chat_messages_multiple_messages(self):
        """Only the last of several chat messages is searched."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        chat = [
            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
            ChatMessage.from_user('Second message with <issue url="second.com">second</issue>'),
            ChatMessage.from_user('Last message with <issue url="last.com">last</issue>'),
        ]
        output = sut.run(text_or_messages=chat)
        assert output == {"captured_text": "last.com"}

    def test_extract_from_chat_messages_no_match_in_last(self):
        """A match in an earlier message is ignored when the last one has none."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        chat = [
            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
            ChatMessage.from_user("Last message with no matching pattern"),
        ]
        output = sut.run(text_or_messages=chat)
        assert output == {}

    def test_extract_from_chat_messages_empty_list(self):
        """An empty message list produces an empty result."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        output = sut.run(text_or_messages=[])
        assert output == {}

    def test_extract_from_chat_messages_invalid_type(self):
        """A list whose last element is not a ChatMessage raises ValueError."""
        sut = RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">')
        not_messages = ["not a ChatMessage object"]
        with pytest.raises(ValueError, match="Expected ChatMessage object, got <class 'str'>"):
            sut.run(text_or_messages=not_messages)

    def test_multiple_capture_groups(self):
        """With several capture groups only the first one (the username) is returned."""
        sut = RegexTextExtractor(regex_pattern=r"(\w+)@(\w+)\.(\w+)")
        output = sut.run(text_or_messages="Contact us at user@example.com for support")
        assert output == {"captured_text": "user"}

    def test_special_characters_in_pattern(self):
        """Escaped special characters in the pattern are matched literally."""
        sut = RegexTextExtractor(regex_pattern=r"\[(\w+)\]")
        output = sut.run(text_or_messages="This has [special] characters [in] brackets")
        assert output == {"captured_text": "special"}

    def test_whitespace_handling(self):
        """A word surrounded by whitespace is captured without the whitespace."""
        sut = RegexTextExtractor(regex_pattern=r"\s+(\w+)\s+")
        output = sut.run(text_or_messages="word1 word2 word3")
        assert output == {"captured_text": "word2"}

    def test_nested_capture_groups(self):
        """With a tag-name group and an attribute group, the tag name (first group) is returned."""
        sut = RegexTextExtractor(regex_pattern=r'<(\w+)\s+attr="([^"]+)">')
        output = sut.run(text_or_messages='<div attr="value">content</div>')
        assert output == {"captured_text": "div"}

    def test_optional_capture_group(self):
        """An optional trailing group does not change the first-group result."""
        sut = RegexTextExtractor(regex_pattern=r"(\w+)(?:@(\w+))?")
        output = sut.run(text_or_messages="username@domain")
        assert output == {"captured_text": "username"}

    def test_optional_capture_group_no_match(self):
        """The first group is still returned when the optional part is absent."""
        sut = RegexTextExtractor(regex_pattern=r"(\w+)(?:@(\w+))?")
        output = sut.run(text_or_messages="username")
        assert output == {"captured_text": "username"}

    def test_pipeline_integration(self):
        """The component works inside a Haystack pipeline with string input."""
        pipeline = Pipeline()
        pipeline.add_component("extractor", RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">'))
        payload = '<issue url="github.com/pipeline-test">pipeline test</issue>'
        output = pipeline.run(data={"extractor": {"text_or_messages": payload}})
        assert output["extractor"] == {"captured_text": "github.com/pipeline-test"}

    def test_pipeline_integration_with_chat_messages(self):
        """The component works inside a Haystack pipeline with ChatMessage input."""
        pipeline = Pipeline()
        pipeline.add_component("extractor", RegexTextExtractor(regex_pattern=r'<issue url="(.+?)">'))
        chat = [ChatMessage.from_user('<issue url="github.com/chat-test">chat test</issue>')]
        output = pipeline.run(data={"extractor": {"text_or_messages": chat}})
        assert output["extractor"] == {"captured_text": "github.com/chat-test"}

    def test_very_long_text(self):
        """A match buried in a long text is still found."""
        sut = RegexTextExtractor(regex_pattern=r"(\d+)")
        haystack_text = "a" * 10000 + "123" + "b" * 10000
        output = sut.run(text_or_messages=haystack_text)
        assert output == {"captured_text": "123"}

    def test_multiple_matches_first_is_captured(self):
        """When the pattern matches several times, the first match wins."""
        sut = RegexTextExtractor(regex_pattern=r"(\d+)")
        output = sut.run(text_or_messages="First: 123, Second: 456, Third: 789")
        assert output == {"captured_text": "123"}