feat: adding RegexTextExtractor component from experimental (#9879)

* initial import of component * adding release notes * adding docs to docusaurus
2025-12-01 09:27:28 +00:00 · 2025-10-15 13:55:22 +02:00 · 2025-10-15 13:55:22 +02:00 · cfa5d27614
commit cfa5d27614
parent fe60c765d9
6 changed files with 331 additions and 6 deletions
--- a/docs/pydoc/config/extractors_api.yml
+++ b/docs/pydoc/config/extractors_api.yml
@ -1,7 +1,12 @@
 loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path: [../../../haystack/components/extractors]
-    modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
+    modules: [
+      "named_entity_extractor",
+      "llm_metadata_extractor",
+      "image/llm_document_content_extractor",
+      "regex_text_extractor",
+    ]
    ignore_when_discovered: ["__init__"]
 processors:
  - type: filter
@ -15,7 +20,7 @@ processors:
  - type: crossref
 renderer:
  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
-  excerpt: Extracts predefined entities out of a piece of text.
+  excerpt: Components to extract specific elements from textual data.
  category_slug: haystack-api
  title: Extractors
  slug: extractors-api
--- a/docs/pydoc/config_docusaurus/extractors_api.yml
+++ b/docs/pydoc/config_docusaurus/extractors_api.yml
@ -1,7 +1,12 @@
 loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path: [../../../haystack/components/extractors]
-    modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
+    modules: [
+      "named_entity_extractor",
+      "llm_metadata_extractor",
+      "image/llm_document_content_extractor",
+      "regex_text_extractor",
+    ]
    ignore_when_discovered: ["__init__"]
 processors:
  - type: filter
@ -14,10 +19,12 @@ processors:
  - type: smart
  - type: crossref
 renderer:
-  type: haystack_pydoc_tools.renderers.DocusaurusRenderer
-  description: Extracts predefined entities out of a piece of text.
+  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
+  excerpt: Components to extract specific elements from textual data.
+  category_slug: haystack-api
  title: Extractors
-  id: extractors-api
+  slug: extractors-api
+  order: 65
  markdown:
    descriptive_class_title: false
    classdef_code_block: false
--- a/haystack/components/extractors/__init__.py
+++ b/haystack/components/extractors/__init__.py
@ -9,6 +9,7 @@ from lazy_imports import LazyImporter

 _import_structure = {
    "llm_metadata_extractor": ["LLMMetadataExtractor"],
+    "regex_text_extractor": ["RegexTextExtractor"],
    "named_entity_extractor": ["NamedEntityAnnotation", "NamedEntityExtractor", "NamedEntityExtractorBackend"],
 }

@ -17,6 +18,7 @@ if TYPE_CHECKING:
    from .named_entity_extractor import NamedEntityAnnotation as NamedEntityAnnotation
    from .named_entity_extractor import NamedEntityExtractor as NamedEntityExtractor
    from .named_entity_extractor import NamedEntityExtractorBackend as NamedEntityExtractorBackend
+    from .regex_text_extractor import RegexTextExtractor

 else:
    sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
--- a/haystack/components/extractors/regex_text_extractor.py
+++ b/haystack/components/extractors/regex_text_extractor.py
@ -0,0 +1,117 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+from typing import Union
+
+from haystack import component, logging
+from haystack.dataclasses import ChatMessage
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class RegexTextExtractor:
+    """
+    Extracts text from a string or from the last ChatMessage of a list using a regex pattern.
+
+    The first match of the pattern in the input is used. If the pattern defines capture groups, the text of
+    the first group is returned; otherwise the entire match is returned. When a list of ChatMessages is
+    passed, only the last message is searched.
+
+    ### Usage example
+
+    ```python
+    from haystack.components.extractors import RegexTextExtractor
+    from haystack.dataclasses import ChatMessage
+
+    # Using with a string
+    parser = RegexTextExtractor(regex_pattern='<issue url="(.+)">')
+    result = parser.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
+    # result: {"captured_text": "github.com/hahahaha"}
+
+    # Using with ChatMessages
+    messages = [ChatMessage.from_user('<issue url="github.com/hahahaha">hahahah</issue>')]
+    result = parser.run(text_or_messages=messages)
+    # result: {"captured_text": "github.com/hahahaha"}
+    ```
+    """
+
+    def __init__(self, regex_pattern: str):
+        """
+        Creates an instance of the RegexTextExtractor component.
+
+        :param regex_pattern:
+            The regular expression pattern used to extract text.
+            The pattern should include a capture group to extract the desired text.
+            Example: '<issue url="(.+)">' captures 'github.com/hahahaha' from '<issue url="github.com/hahahaha">'.
+
+        :raises re.error:
+            If `regex_pattern` is not a valid regular expression.
+        """
+        self.regex_pattern = regex_pattern
+
+        # Compiling up front surfaces invalid patterns at construction time and lets us count capture groups.
+        if re.compile(regex_pattern).groups < 1:
+            logger.warning(
+                "The provided regex pattern {regex_pattern} doesn't contain any capture groups. "
+                "The entire match will be returned instead.",
+                regex_pattern=regex_pattern,
+            )
+
+    # NOTE: `captured_texts` is declared for backward compatibility but is never populated by `run`.
+    @component.output_types(captured_text=str, captured_texts=list[str])
+    def run(self, text_or_messages: Union[str, list[ChatMessage]]) -> dict:
+        """
+        Extracts text from input using the configured regex pattern.
+
+        :param text_or_messages:
+            Either a string or a list of ChatMessage objects. For a list, only the last message is searched.
+
+        :returns:
+            A dictionary with the following keys:
+            - `captured_text`: The text captured by the first capture group of the first match, or the entire
+              match if the pattern has no capture groups.
+
+            An empty dictionary is returned when nothing is captured: no match, an empty or non-participating
+            capture group, an empty list of messages, or a last message without text.
+
+        :raises ValueError:
+            If a list is passed and its last element is not a ChatMessage instance.
+        """
+        if isinstance(text_or_messages, str):
+            return self._build_result(self._extract_from_text(text_or_messages))
+        if not text_or_messages:
+            logger.warning("Received empty list of messages")
+            return {}
+        return self._process_last_message(text_or_messages)
+
+    @staticmethod
+    def _build_result(result: Union[str, list[str]]) -> dict:
+        """Wrap a non-empty extraction result under `captured_text`; map an empty result to an empty dict."""
+        return {"captured_text": result} if result else {}
+
+    def _process_last_message(self, messages: list[ChatMessage]) -> dict:
+        """
+        Run the extraction on the last message of a non-empty list and build the result.
+
+        :param messages:
+            A non-empty list of ChatMessage objects; earlier messages are ignored.
+
+        :returns:
+            The same dictionary shape as `run`.
+
+        :raises ValueError:
+            If the last element is not a ChatMessage instance.
+        """
+        last_message = messages[-1]
+        if not isinstance(last_message, ChatMessage):
+            raise ValueError(f"Expected ChatMessage object, got {type(last_message)}")
+        if last_message.text is None:
+            logger.warning("Last message has no text content")
+            return {}
+        return self._build_result(self._extract_from_text(last_message.text))
+
+    def _extract_from_text(self, text: str) -> str:
+        """
+        Extract text using the regex pattern.
+
+        :param text:
+            The text to search through.
+
+        :returns:
+            The text captured by the first capturing group in the regex pattern.
+            If the pattern has no capture groups, returns the entire match.
+            If no match is found, or the first group did not participate in the match, returns an empty string.
+        """
+        match = re.search(self.regex_pattern, text)
+        if match is None:
+            return ""
+        captured = match.group(1) if match.re.groups else match.group(0)
+        # An optional first group that did not take part in the match yields None; treat it as nothing captured
+        # so that `captured_text` is always a string.
+        return captured if captured is not None else ""
--- a/releasenotes/notes/adding-RegexTextExtractor-7437afdf2c54d596.yaml
+++ b/releasenotes/notes/adding-RegexTextExtractor-7437afdf2c54d596.yaml
@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Added a new component, `RegexTextExtractor`, that extracts text from chat messages or string input using a custom regex pattern.
--- a/test/components/extractors/test_regex_text_extractor.py
+++ b/test/components/extractors/test_regex_text_extractor.py
@ -0,0 +1,190 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from haystack import Pipeline
+from haystack.components.extractors.regex_text_extractor import RegexTextExtractor
+from haystack.dataclasses import ChatMessage
+
+
+class TestRegexTextExtractor:
+    """Unit tests for RegexTextExtractor covering string input, ChatMessage input and pipeline usage."""
+
+    # Pattern shared by most tests: captures the value of the `url` attribute of an `<issue>` tag.
+    ISSUE_URL_PATTERN = r'<issue url="(.+?)">'
+
+    @staticmethod
+    def _extract(pattern, text_or_messages):
+        """Build an extractor for `pattern` and return the output of running it on the given input."""
+        return RegexTextExtractor(regex_pattern=pattern).run(text_or_messages=text_or_messages)
+
+    def test_init_with_capture_group(self):
+        """The constructor stores a pattern with a capture group unchanged."""
+        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
+        assert component_under_test.regex_pattern == self.ISSUE_URL_PATTERN
+
+    def test_init_without_capture_group(self):
+        """The constructor stores a pattern without a capture group unchanged."""
+        regex = r"<issue>"
+        component_under_test = RegexTextExtractor(regex_pattern=regex)
+        assert component_under_test.regex_pattern == regex
+
+    def test_extract_from_string_with_capture_group(self):
+        """A string input yields the text of the capture group."""
+        output = self._extract(self.ISSUE_URL_PATTERN, '<issue url="github.com/hahahaha">hahahah</issue>')
+        assert output == {"captured_text": "github.com/hahahaha"}
+
+    def test_extract_from_string_without_capture_group(self):
+        """Without a capture group the entire match is returned."""
+        output = self._extract(r"<issue>", "This is an <issue> tag in the text")
+        assert output == {"captured_text": "<issue>"}
+
+    def test_extract_from_string_no_match(self):
+        """A string that does not match yields an empty dict."""
+        output = self._extract(self.ISSUE_URL_PATTERN, "This text has no matching pattern")
+        assert output == {}
+
+    def test_extract_from_string_empty_input(self):
+        """An empty string yields an empty dict."""
+        output = self._extract(self.ISSUE_URL_PATTERN, "")
+        assert output == {}
+
+    def test_extract_from_chat_messages_single_message(self):
+        """A single ChatMessage is searched like a plain string."""
+        chat = [ChatMessage.from_user('<issue url="github.com/test">test issue</issue>')]
+        output = self._extract(self.ISSUE_URL_PATTERN, chat)
+        assert output == {"captured_text": "github.com/test"}
+
+    def test_extract_from_chat_messages_multiple_messages(self):
+        """With several messages only the last one is searched."""
+        chat = [
+            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
+            ChatMessage.from_user('Second message with <issue url="second.com">second</issue>'),
+            ChatMessage.from_user('Last message with <issue url="last.com">last</issue>'),
+        ]
+        output = self._extract(self.ISSUE_URL_PATTERN, chat)
+        assert output == {"captured_text": "last.com"}
+
+    def test_extract_from_chat_messages_no_match_in_last(self):
+        """A match in an earlier message is ignored when the last message does not match."""
+        chat = [
+            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
+            ChatMessage.from_user("Last message with no matching pattern"),
+        ]
+        output = self._extract(self.ISSUE_URL_PATTERN, chat)
+        assert output == {}
+
+    def test_extract_from_chat_messages_empty_list(self):
+        """An empty list of messages yields an empty dict."""
+        output = self._extract(self.ISSUE_URL_PATTERN, [])
+        assert output == {}
+
+    def test_extract_from_chat_messages_invalid_type(self):
+        """A list whose last element is not a ChatMessage raises ValueError."""
+        component_under_test = RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN)
+        with pytest.raises(ValueError, match="Expected ChatMessage object, got <class 'str'>"):
+            component_under_test.run(text_or_messages=["not a ChatMessage object"])
+
+    def test_multiple_capture_groups(self):
+        """With several capture groups only the first one (the username) is returned."""
+        output = self._extract(r"(\w+)@(\w+)\.(\w+)", "Contact us at user@example.com for support")
+        assert output == {"captured_text": "user"}
+
+    def test_special_characters_in_pattern(self):
+        """Escaped special characters in the pattern are honoured."""
+        output = self._extract(r"\[(\w+)\]", "This has [special] characters [in] brackets")
+        assert output == {"captured_text": "special"}
+
+    def test_whitespace_handling(self):
+        """Whitespace around the capture group is matched but not captured."""
+        output = self._extract(r"\s+(\w+)\s+", "word1   word2   word3")
+        assert output == {"captured_text": "word2"}
+
+    def test_nested_capture_groups(self):
+        """Only the first of two capture groups (the tag name) is returned; the groups are sequential."""
+        output = self._extract(r'<(\w+)\s+attr="([^"]+)">', '<div attr="value">content</div>')
+        assert output == {"captured_text": "div"}
+
+    def test_optional_capture_group(self):
+        """A trailing optional group that matches does not change the returned first group."""
+        output = self._extract(r"(\w+)(?:@(\w+))?", "username@domain")
+        assert output == {"captured_text": "username"}
+
+    def test_optional_capture_group_no_match(self):
+        """A trailing optional group that is absent does not change the returned first group."""
+        output = self._extract(r"(\w+)(?:@(\w+))?", "username")
+        assert output == {"captured_text": "username"}
+
+    def test_pipeline_integration(self):
+        """The component runs inside a Haystack pipeline with string input."""
+        pipeline = Pipeline()
+        pipeline.add_component("extractor", RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN))
+
+        payload = '<issue url="github.com/pipeline-test">pipeline test</issue>'
+        outputs = pipeline.run(data={"extractor": {"text_or_messages": payload}})
+
+        assert outputs["extractor"] == {"captured_text": "github.com/pipeline-test"}
+
+    def test_pipeline_integration_with_chat_messages(self):
+        """The component runs inside a Haystack pipeline with ChatMessage input."""
+        pipeline = Pipeline()
+        pipeline.add_component("extractor", RegexTextExtractor(regex_pattern=self.ISSUE_URL_PATTERN))
+
+        payload = [ChatMessage.from_user('<issue url="github.com/chat-test">chat test</issue>')]
+        outputs = pipeline.run(data={"extractor": {"text_or_messages": payload}})
+
+        assert outputs["extractor"] == {"captured_text": "github.com/chat-test"}
+
+    def test_very_long_text(self):
+        """A match buried in a long input is still found."""
+        haystack_text = "a" * 10000 + "123" + "b" * 10000
+        output = self._extract(r"(\d+)", haystack_text)
+        assert output == {"captured_text": "123"}
+
+    def test_multiple_matches_first_is_captured(self):
+        """When the pattern matches several times, the first match wins."""
+        output = self._extract(r"(\d+)", "First: 123, Second: 456, Third: 789")
+        assert output == {"captured_text": "123"}