mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-01 09:27:28 +00:00
feat: adding RegexTextExtractor component from experimental (#9879)
* initial import of component * adding release notes * adding docs to docusaurus
This commit is contained in:
parent
fe60c765d9
commit
cfa5d27614
@ -1,7 +1,12 @@
|
||||
loaders:
|
||||
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
|
||||
search_path: [../../../haystack/components/extractors]
|
||||
modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
|
||||
modules: [
|
||||
"named_entity_extractor",
|
||||
"llm_metadata_extractor",
|
||||
"image/llm_document_content_extractor",
|
||||
"regex_text_extractor",
|
||||
]
|
||||
ignore_when_discovered: ["__init__"]
|
||||
processors:
|
||||
- type: filter
|
||||
@ -15,7 +20,7 @@ processors:
|
||||
- type: crossref
|
||||
renderer:
|
||||
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
|
||||
excerpt: Extracts predefined entities out of a piece of text.
|
||||
excerpt: Components to extract specific elements from textual data.
|
||||
category_slug: haystack-api
|
||||
title: Extractors
|
||||
slug: extractors-api
|
||||
|
||||
@ -1,7 +1,12 @@
|
||||
loaders:
|
||||
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
|
||||
search_path: [../../../haystack/components/extractors]
|
||||
modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
|
||||
modules: [
|
||||
"named_entity_extractor",
|
||||
"llm_metadata_extractor",
|
||||
"image/llm_document_content_extractor",
|
||||
"regex_text_extractor",
|
||||
]
|
||||
ignore_when_discovered: ["__init__"]
|
||||
processors:
|
||||
- type: filter
|
||||
@ -14,10 +19,12 @@ processors:
|
||||
- type: smart
|
||||
- type: crossref
|
||||
renderer:
|
||||
type: haystack_pydoc_tools.renderers.DocusaurusRenderer
|
||||
description: Extracts predefined entities out of a piece of text.
|
||||
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
|
||||
excerpt: Components to extract specific elements from textual data.
|
||||
category_slug: haystack-api
|
||||
title: Extractors
|
||||
id: extractors-api
|
||||
slug: extractors-api
|
||||
order: 65
|
||||
markdown:
|
||||
descriptive_class_title: false
|
||||
classdef_code_block: false
|
||||
|
||||
@ -9,6 +9,7 @@ from lazy_imports import LazyImporter
|
||||
|
||||
_import_structure = {
|
||||
"llm_metadata_extractor": ["LLMMetadataExtractor"],
|
||||
"regex_text_extractor": ["RegexTextExtractor"],
|
||||
"named_entity_extractor": ["NamedEntityAnnotation", "NamedEntityExtractor", "NamedEntityExtractorBackend"],
|
||||
}
|
||||
|
||||
@ -17,6 +18,7 @@ if TYPE_CHECKING:
|
||||
from .named_entity_extractor import NamedEntityAnnotation as NamedEntityAnnotation
|
||||
from .named_entity_extractor import NamedEntityExtractor as NamedEntityExtractor
|
||||
from .named_entity_extractor import NamedEntityExtractorBackend as NamedEntityExtractorBackend
|
||||
from .regex_text_extractor import RegexTextExtractor
|
||||
|
||||
else:
|
||||
sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
|
||||
|
||||
117
haystack/components/extractors/regex_text_extractor.py
Normal file
117
haystack/components/extractors/regex_text_extractor.py
Normal file
@ -0,0 +1,117 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import re
|
||||
from typing import Union
|
||||
|
||||
from haystack import component, logging
|
||||
from haystack.dataclasses import ChatMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
class RegexTextExtractor:
    """
    Extracts text from chat message or string input using a regex pattern.

    RegexTextExtractor searches the input with a provided regular expression pattern and returns the first match.
    When the input is a list of ChatMessages, only the last message in the list is searched.

    If the pattern contains capture groups, the text captured by the first group is returned; otherwise the
    entire match is returned.

    ### Usage example

    ```python
    from haystack.components.extractors import RegexTextExtractor
    from haystack.dataclasses import ChatMessage

    # Using with a string
    parser = RegexTextExtractor(regex_pattern='<issue url="(.+)">')
    result = parser.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
    # result: {"captured_text": "github.com/hahahaha"}

    # Using with ChatMessages
    messages = [ChatMessage.from_user('<issue url="github.com/hahahaha">hahahah</issue>')]
    result = parser.run(text_or_messages=messages)
    # result: {"captured_text": "github.com/hahahaha"}
    ```
    """

    def __init__(self, regex_pattern: str):
        """
        Creates an instance of the RegexTextExtractor component.

        :param regex_pattern:
            The regular expression pattern used to extract text.
            The pattern should include a capture group to extract the desired text.
            Example: '<issue url="(.+)">' captures 'github.com/hahahaha' from '<issue url="github.com/hahahaha">'.

        :raises re.error:
            If `regex_pattern` is not a valid regular expression.
        """
        self.regex_pattern = regex_pattern

        # Compiling here both validates the pattern early and tells us how many capture groups it declares.
        num_groups = re.compile(regex_pattern).groups
        if num_groups < 1:
            logger.warning(
                "The provided regex pattern {regex_pattern} doesn't contain any capture groups. "
                "The entire match will be returned instead.",
                regex_pattern=regex_pattern,
            )

    # NOTE: `captured_texts` is declared as an output but is never populated by this implementation.
    # It is kept in the declaration so that the component's output sockets stay unchanged for existing pipelines.
    @component.output_types(captured_text=str, captured_texts=list[str])
    def run(self, text_or_messages: Union[str, list[ChatMessage]]) -> dict:
        """
        Extracts text from input using the configured regex pattern.

        :param text_or_messages:
            Either a string or a list of ChatMessage objects. For a list, only the last message is searched.

        :returns:
            - If a match is found: `{"captured_text": "matched text"}`
            - If nothing is captured (no match, empty list, or last message without text): `{}`

        :raises ValueError:
            If a list is received and its last element is not a ChatMessage instance.
        """
        if isinstance(text_or_messages, str):
            return RegexTextExtractor._build_result(self._extract_from_text(text_or_messages))
        if not text_or_messages:
            logger.warning("Received empty list of messages")
            return {}
        return self._process_last_message(text_or_messages)

    @staticmethod
    def _build_result(result: Union[str, list[str]]) -> dict:
        """
        Builds the output dictionary of the component.

        :param result:
            The extracted text (or list of texts).

        :returns:
            `{}` when `result` is empty, otherwise `{"captured_text": result}`.
        """
        # An empty string, an empty list and None all mean that nothing was captured.
        if not result:
            return {}
        return {"captured_text": result}

    def _process_last_message(self, messages: list[ChatMessage]) -> dict:
        """
        Runs the extraction on the last message of a non-empty list and builds the result.

        :param messages:
            A non-empty list of ChatMessage objects.

        :returns:
            The output dictionary, or `{}` if the last message has no text or nothing is captured.

        :raises ValueError:
            If the last element of the list is not a ChatMessage instance.
        """
        last_message = messages[-1]
        if not isinstance(last_message, ChatMessage):
            raise ValueError(f"Expected ChatMessage object, got {type(last_message)}")
        if last_message.text is None:
            logger.warning("Last message has no text content")
            return {}
        result = self._extract_from_text(last_message.text)
        return RegexTextExtractor._build_result(result)

    def _extract_from_text(self, text: str) -> Union[str, list[str]]:
        """
        Extract text using the regex pattern.

        :param text:
            The text to search through.

        :returns:
            The text captured by the first capturing group in the regex pattern.
            If the pattern has no capture groups, returns the entire match.
            If no match is found, or the first group did not take part in the match, returns an empty string.
        """
        match = re.search(self.regex_pattern, text)
        if match is None:
            return ""
        if match.groups():
            # `group(1)` is None when the first group is optional and did not participate in the match
            # (e.g. "(a)?b" against "b"); report that as "nothing captured" instead of leaking None.
            return match.group(1) or ""
        return match.group(0)
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Added a new component, `RegexTextExtractor`, which extracts text from a string or from chat messages using a custom regex pattern.
|
||||
190
test/components/extractors/test_regex_text_extractor.py
Normal file
190
test/components/extractors/test_regex_text_extractor.py
Normal file
@ -0,0 +1,190 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack import Pipeline
|
||||
from haystack.components.extractors.regex_text_extractor import RegexTextExtractor
|
||||
from haystack.dataclasses import ChatMessage
|
||||
|
||||
|
||||
class TestRegexTextExtractor:
    """Unit and pipeline-level tests for the RegexTextExtractor component."""

    # Pattern shared by most tests: captures the value of the `url` attribute of an `<issue>` tag.
    ISSUE_URL_PATTERN = r'<issue url="(.+?)">'

    def test_init_with_capture_group(self):
        """The pattern is stored unchanged when it contains a capture group."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        assert component_under_test.regex_pattern == self.ISSUE_URL_PATTERN

    def test_init_without_capture_group(self):
        """The pattern is stored unchanged even when it has no capture group."""
        group_free_pattern = r"<issue>"
        component_under_test = RegexTextExtractor(regex_pattern=group_free_pattern)
        assert component_under_test.regex_pattern == group_free_pattern

    def test_extract_from_string_with_capture_group(self):
        """A string input yields the text of the first capture group."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        output = component_under_test.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
        assert output == {"captured_text": "github.com/hahahaha"}

    def test_extract_from_string_without_capture_group(self):
        """Without a capture group the whole match is returned."""
        component_under_test = RegexTextExtractor(regex_pattern=r"<issue>")
        output = component_under_test.run(text_or_messages="This is an <issue> tag in the text")
        assert output == {"captured_text": "<issue>"}

    def test_extract_from_string_no_match(self):
        """A string that does not match produces an empty result."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        output = component_under_test.run(text_or_messages="This text has no matching pattern")
        assert output == {}

    def test_extract_from_string_empty_input(self):
        """An empty string produces an empty result."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        output = component_under_test.run(text_or_messages="")
        assert output == {}

    def test_extract_from_chat_messages_single_message(self):
        """A single ChatMessage is searched like a plain string."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        conversation = [ChatMessage.from_user('<issue url="github.com/test">test issue</issue>')]
        output = component_under_test.run(text_or_messages=conversation)
        assert output == {"captured_text": "github.com/test"}

    def test_extract_from_chat_messages_multiple_messages(self):
        """Only the last message of the list is searched."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        conversation = [
            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
            ChatMessage.from_user('Second message with <issue url="second.com">second</issue>'),
            ChatMessage.from_user('Last message with <issue url="last.com">last</issue>'),
        ]
        output = component_under_test.run(text_or_messages=conversation)
        assert output == {"captured_text": "last.com"}

    def test_extract_from_chat_messages_no_match_in_last(self):
        """Matches in earlier messages are ignored when the last one does not match."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        conversation = [
            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
            ChatMessage.from_user("Last message with no matching pattern"),
        ]
        output = component_under_test.run(text_or_messages=conversation)
        assert output == {}

    def test_extract_from_chat_messages_empty_list(self):
        """An empty list of messages produces an empty result."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        output = component_under_test.run(text_or_messages=[])
        assert output == {}

    def test_extract_from_chat_messages_invalid_type(self):
        """A list whose last element is not a ChatMessage is rejected."""
        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
        not_messages = ["not a ChatMessage object"]
        with pytest.raises(ValueError, match="Expected ChatMessage object, got <class 'str'>"):
            component_under_test.run(text_or_messages=not_messages)

    def test_multiple_capture_groups(self):
        """With several capture groups, only the first one (the username) is returned."""
        component_under_test = RegexTextExtractor(regex_pattern=r"(\w+)@(\w+)\.(\w+)")
        output = component_under_test.run(text_or_messages="Contact us at user@example.com for support")
        assert output == {"captured_text": "user"}

    def test_special_characters_in_pattern(self):
        """Escaped special characters in the pattern are honoured."""
        component_under_test = RegexTextExtractor(regex_pattern=r"\[(\w+)\]")
        output = component_under_test.run(text_or_messages="This has [special] characters [in] brackets")
        assert output == {"captured_text": "special"}

    def test_whitespace_handling(self):
        """Whitespace surrounding the capture group is not part of the captured text."""
        component_under_test = RegexTextExtractor(regex_pattern=r"\s+(\w+)\s+")
        output = component_under_test.run(text_or_messages="word1 word2 word3")
        assert output == {"captured_text": "word2"}

    def test_nested_capture_groups(self):
        """With a tag-name group and an attribute group, the first group (tag name) is returned."""
        component_under_test = RegexTextExtractor(regex_pattern=r'<(\w+)\s+attr="([^"]+)">')
        output = component_under_test.run(text_or_messages='<div attr="value">content</div>')
        assert output == {"captured_text": "div"}

    def test_optional_capture_group(self):
        """An optional trailing group does not affect the first captured group."""
        component_under_test = RegexTextExtractor(regex_pattern=r"(\w+)(?:@(\w+))?")
        output = component_under_test.run(text_or_messages="username@domain")
        assert output == {"captured_text": "username"}

    def test_optional_capture_group_no_match(self):
        """The first group is still returned when the optional part is missing from the text."""
        component_under_test = RegexTextExtractor(regex_pattern=r"(\w+)(?:@(\w+))?")
        output = component_under_test.run(text_or_messages="username")
        assert output == {"captured_text": "username"}

    def test_pipeline_integration(self):
        """The component works with string input when run inside a Haystack pipeline."""
        pipeline = Pipeline()
        pipeline.add_component("extractor", RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN))

        payload = '<issue url="github.com/pipeline-test">pipeline test</issue>'
        pipeline_output = pipeline.run(data={"extractor": {"text_or_messages": payload}})

        assert pipeline_output["extractor"] == {"captured_text": "github.com/pipeline-test"}

    def test_pipeline_integration_with_chat_messages(self):
        """The component works with ChatMessage input when run inside a Haystack pipeline."""
        pipeline = Pipeline()
        pipeline.add_component("extractor", RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN))

        conversation = [ChatMessage.from_user('<issue url="github.com/chat-test">chat test</issue>')]
        pipeline_output = pipeline.run(data={"extractor": {"text_or_messages": conversation}})

        assert pipeline_output["extractor"] == {"captured_text": "github.com/chat-test"}

    def test_very_long_text(self):
        """A match buried in a long input is still found."""
        component_under_test = RegexTextExtractor(regex_pattern=r"(\d+)")
        haystack_text = "a" * 10000 + "123" + "b" * 10000
        output = component_under_test.run(text_or_messages=haystack_text)
        assert output == {"captured_text": "123"}

    def test_multiple_matches_first_is_captured(self):
        """When the pattern matches several times, only the first match is returned."""
        component_under_test = RegexTextExtractor(regex_pattern=r"(\d+)")
        output = component_under_test.run(text_or_messages="First: 123, Second: 456, Third: 789")
        assert output == {"captured_text": "123"}
|
||||
Loading…
x
Reference in New Issue
Block a user