build: Remove mmh3 dependency (#4896)

* build: Remove mmh3 dependency

* resolve circular import

* pylint

* make mmh3.py sibling of schema.py

* pylint import order

* pylint

* undo example changes

* increase coverage in modeling module

* increase coverage further

* rename new unit tests
Julian Risch 2023-05-17 21:31:08 +02:00 committed by GitHub
parent df46e7fadd
commit 8cfeed095d
9 changed files with 421 additions and 11 deletions
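
Functionally, the commit vendors a pure-Python murmur3 port (based on wc-duck/pymmh3) as haystack/mmh3.py and rewrites the three call sites. A minimal before/after sketch of the call-site change (illustrative snippet, not part of the diff; the signed=False keyword drops out because the vendored hash128 always returns an unsigned integer):

    # before: C-extension dependency
    import mmh3
    doc_id = "{:02x}".format(mmh3.hash128("some text", signed=False))

    # after: vendored pure-Python port, unsigned by default
    from haystack.mmh3 import hash128
    doc_id = "{:02x}".format(hash128("some text"))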

haystack/__init__.py

@@ -31,7 +31,6 @@ generalimport(
    "magic",
    "markdown",
    "mlflow",
-    "mmh3",
    "more_itertools",
    "networkx",
    "nltk",
@@ -94,6 +93,7 @@ from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult
from haystack.nodes.base import BaseComponent
from haystack.pipelines.base import Pipeline
from haystack.environment import set_pytorch_secure_model_loading
+from haystack.mmh3 import hash128

# Enables torch's secure model loading through setting an env var.

haystack/mmh3.py Normal file (+344 lines)

@@ -0,0 +1,344 @@
import sys as _sys
# based on https://github.com/wc-duck/pymmh3/blob/master/pymmh3.py
if _sys.version_info > (3, 0):
def xrange(a, b, c):
return range(a, b, c)
def xencode(x):
if isinstance(x, (bytes, bytearray)):
return x
else:
return x.encode()
else:
def xencode(x):
return x
del _sys
def hash128(key, seed=0x0, x64arch=True):
"""Implements 128bit murmur3 hash."""
def hash128_x64(key, seed):
"""Implements 128bit murmur3 hash for x64."""
def fmix(k):
k ^= k >> 33
k = (k * 0xFF51AFD7ED558CCD) & 0xFFFFFFFFFFFFFFFF
k ^= k >> 33
k = (k * 0xC4CEB9FE1A85EC53) & 0xFFFFFFFFFFFFFFFF
k ^= k >> 33
return k
length = len(key)
nblocks = int(length / 16)
h1 = seed
h2 = seed
c1 = 0x87C37B91114253D5
c2 = 0x4CF5AD432745937F
# body
for block_start in xrange(0, nblocks * 8, 8):
# ??? big endian?
k1 = (
key[2 * block_start + 7] << 56
| key[2 * block_start + 6] << 48
| key[2 * block_start + 5] << 40
| key[2 * block_start + 4] << 32
| key[2 * block_start + 3] << 24
| key[2 * block_start + 2] << 16
| key[2 * block_start + 1] << 8
| key[2 * block_start + 0]
)
k2 = (
key[2 * block_start + 15] << 56
| key[2 * block_start + 14] << 48
| key[2 * block_start + 13] << 40
| key[2 * block_start + 12] << 32
| key[2 * block_start + 11] << 24
| key[2 * block_start + 10] << 16
| key[2 * block_start + 9] << 8
| key[2 * block_start + 8]
)
k1 = (c1 * k1) & 0xFFFFFFFFFFFFFFFF
k1 = (k1 << 31 | k1 >> 33) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
k1 = (c2 * k1) & 0xFFFFFFFFFFFFFFFF
h1 ^= k1
h1 = (h1 << 27 | h1 >> 37) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF
h1 = (h1 * 5 + 0x52DCE729) & 0xFFFFFFFFFFFFFFFF
k2 = (c2 * k2) & 0xFFFFFFFFFFFFFFFF
k2 = (k2 << 33 | k2 >> 31) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
k2 = (c1 * k2) & 0xFFFFFFFFFFFFFFFF
h2 ^= k2
h2 = (h2 << 31 | h2 >> 33) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF
h2 = (h2 * 5 + 0x38495AB5) & 0xFFFFFFFFFFFFFFFF
# tail
tail_index = nblocks * 16
k1 = 0
k2 = 0
tail_size = length & 15
if tail_size >= 15:
k2 ^= key[tail_index + 14] << 48
if tail_size >= 14:
k2 ^= key[tail_index + 13] << 40
if tail_size >= 13:
k2 ^= key[tail_index + 12] << 32
if tail_size >= 12:
k2 ^= key[tail_index + 11] << 24
if tail_size >= 11:
k2 ^= key[tail_index + 10] << 16
if tail_size >= 10:
k2 ^= key[tail_index + 9] << 8
if tail_size >= 9:
k2 ^= key[tail_index + 8]
if tail_size > 8:
k2 = (k2 * c2) & 0xFFFFFFFFFFFFFFFF
k2 = (k2 << 33 | k2 >> 31) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
k2 = (k2 * c1) & 0xFFFFFFFFFFFFFFFF
h2 ^= k2
if tail_size >= 8:
k1 ^= key[tail_index + 7] << 56
if tail_size >= 7:
k1 ^= key[tail_index + 6] << 48
if tail_size >= 6:
k1 ^= key[tail_index + 5] << 40
if tail_size >= 5:
k1 ^= key[tail_index + 4] << 32
if tail_size >= 4:
k1 ^= key[tail_index + 3] << 24
if tail_size >= 3:
k1 ^= key[tail_index + 2] << 16
if tail_size >= 2:
k1 ^= key[tail_index + 1] << 8
if tail_size >= 1:
k1 ^= key[tail_index + 0]
if tail_size > 0:
k1 = (k1 * c1) & 0xFFFFFFFFFFFFFFFF
k1 = (k1 << 31 | k1 >> 33) & 0xFFFFFFFFFFFFFFFF # inlined ROTL64
k1 = (k1 * c2) & 0xFFFFFFFFFFFFFFFF
h1 ^= k1
# finalization
h1 ^= length
h2 ^= length
h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF
h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF
h1 = fmix(h1)
h2 = fmix(h2)
h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF
h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF
return h2 << 64 | h1
def hash128_x86(key, seed):
"""Implements 128bit murmur3 hash for x86."""
def fmix(h):
h ^= h >> 16
h = (h * 0x85EBCA6B) & 0xFFFFFFFF
h ^= h >> 13
h = (h * 0xC2B2AE35) & 0xFFFFFFFF
h ^= h >> 16
return h
length = len(key)
nblocks = int(length / 16)
h1 = seed
h2 = seed
h3 = seed
h4 = seed
c1 = 0x239B961B
c2 = 0xAB0E9789
c3 = 0x38B34AE5
c4 = 0xA1E38B93
# body
for block_start in xrange(0, nblocks * 16, 16):
k1 = (
key[block_start + 3] << 24
| key[block_start + 2] << 16
| key[block_start + 1] << 8
| key[block_start + 0]
)
k2 = (
key[block_start + 7] << 24
| key[block_start + 6] << 16
| key[block_start + 5] << 8
| key[block_start + 4]
)
k3 = (
key[block_start + 11] << 24
| key[block_start + 10] << 16
| key[block_start + 9] << 8
| key[block_start + 8]
)
k4 = (
key[block_start + 15] << 24
| key[block_start + 14] << 16
| key[block_start + 13] << 8
| key[block_start + 12]
)
k1 = (c1 * k1) & 0xFFFFFFFF
k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF # inlined ROTL32
k1 = (c2 * k1) & 0xFFFFFFFF
h1 ^= k1
h1 = (h1 << 19 | h1 >> 13) & 0xFFFFFFFF # inlined ROTL32
h1 = (h1 + h2) & 0xFFFFFFFF
h1 = (h1 * 5 + 0x561CCD1B) & 0xFFFFFFFF
k2 = (c2 * k2) & 0xFFFFFFFF
k2 = (k2 << 16 | k2 >> 16) & 0xFFFFFFFF # inlined ROTL32
k2 = (c3 * k2) & 0xFFFFFFFF
h2 ^= k2
h2 = (h2 << 17 | h2 >> 15) & 0xFFFFFFFF # inlined ROTL32
h2 = (h2 + h3) & 0xFFFFFFFF
h2 = (h2 * 5 + 0x0BCAA747) & 0xFFFFFFFF
k3 = (c3 * k3) & 0xFFFFFFFF
k3 = (k3 << 17 | k3 >> 15) & 0xFFFFFFFF # inlined ROTL32
k3 = (c4 * k3) & 0xFFFFFFFF
h3 ^= k3
h3 = (h3 << 15 | h3 >> 17) & 0xFFFFFFFF # inlined ROTL32
h3 = (h3 + h4) & 0xFFFFFFFF
h3 = (h3 * 5 + 0x96CD1C35) & 0xFFFFFFFF
k4 = (c4 * k4) & 0xFFFFFFFF
k4 = (k4 << 18 | k4 >> 14) & 0xFFFFFFFF # inlined ROTL32
k4 = (c1 * k4) & 0xFFFFFFFF
h4 ^= k4
h4 = (h4 << 13 | h4 >> 19) & 0xFFFFFFFF # inlined ROTL32
h4 = (h1 + h4) & 0xFFFFFFFF
h4 = (h4 * 5 + 0x32AC3B17) & 0xFFFFFFFF
# tail
tail_index = nblocks * 16
k1 = 0
k2 = 0
k3 = 0
k4 = 0
tail_size = length & 15
if tail_size >= 15:
k4 ^= key[tail_index + 14] << 16
if tail_size >= 14:
k4 ^= key[tail_index + 13] << 8
if tail_size >= 13:
k4 ^= key[tail_index + 12]
if tail_size > 12:
k4 = (k4 * c4) & 0xFFFFFFFF
k4 = (k4 << 18 | k4 >> 14) & 0xFFFFFFFF # inlined ROTL32
k4 = (k4 * c1) & 0xFFFFFFFF
h4 ^= k4
if tail_size >= 12:
k3 ^= key[tail_index + 11] << 24
if tail_size >= 11:
k3 ^= key[tail_index + 10] << 16
if tail_size >= 10:
k3 ^= key[tail_index + 9] << 8
if tail_size >= 9:
k3 ^= key[tail_index + 8]
if tail_size > 8:
k3 = (k3 * c3) & 0xFFFFFFFF
k3 = (k3 << 17 | k3 >> 15) & 0xFFFFFFFF # inlined ROTL32
k3 = (k3 * c4) & 0xFFFFFFFF
h3 ^= k3
if tail_size >= 8:
k2 ^= key[tail_index + 7] << 24
if tail_size >= 7:
k2 ^= key[tail_index + 6] << 16
if tail_size >= 6:
k2 ^= key[tail_index + 5] << 8
if tail_size >= 5:
k2 ^= key[tail_index + 4]
if tail_size > 4:
k2 = (k2 * c2) & 0xFFFFFFFF
k2 = (k2 << 16 | k2 >> 16) & 0xFFFFFFFF # inlined ROTL32
k2 = (k2 * c3) & 0xFFFFFFFF
h2 ^= k2
if tail_size >= 4:
k1 ^= key[tail_index + 3] << 24
if tail_size >= 3:
k1 ^= key[tail_index + 2] << 16
if tail_size >= 2:
k1 ^= key[tail_index + 1] << 8
if tail_size >= 1:
k1 ^= key[tail_index + 0]
if tail_size > 0:
k1 = (k1 * c1) & 0xFFFFFFFF
k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF # inlined ROTL32
k1 = (k1 * c2) & 0xFFFFFFFF
h1 ^= k1
# finalization
h1 ^= length
h2 ^= length
h3 ^= length
h4 ^= length
h1 = (h1 + h2) & 0xFFFFFFFF
h1 = (h1 + h3) & 0xFFFFFFFF
h1 = (h1 + h4) & 0xFFFFFFFF
h2 = (h1 + h2) & 0xFFFFFFFF
h3 = (h1 + h3) & 0xFFFFFFFF
h4 = (h1 + h4) & 0xFFFFFFFF
h1 = fmix(h1)
h2 = fmix(h2)
h3 = fmix(h3)
h4 = fmix(h4)
h1 = (h1 + h2) & 0xFFFFFFFF
h1 = (h1 + h3) & 0xFFFFFFFF
h1 = (h1 + h4) & 0xFFFFFFFF
h2 = (h1 + h2) & 0xFFFFFFFF
h3 = (h1 + h3) & 0xFFFFFFFF
h4 = (h1 + h4) & 0xFFFFFFFF
return h4 << 96 | h3 << 64 | h2 << 32 | h1
key = bytearray(xencode(key))
if x64arch:
return hash128_x64(key, seed)
else:
return hash128_x86(key, seed)
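
The vendored function keeps pymmh3's interface: it accepts str, bytes, or bytearray, defaults to the x64 variant of murmur3_128, and always returns an unsigned Python int, so no signed=False toggle is needed. A short usage sketch (illustrative, not part of the commit):

    from haystack.mmh3 import hash128

    digest = hash128("This is the document text")  # unsigned 128-bit int
    doc_id = "{:02x}".format(digest)               # hex string, as schema.py derives Document ids
    assert hash128("This is the document text") == digest  # deterministic across processes, unlike builtin hash()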

haystack/schema.py

@@ -18,7 +18,6 @@ import json
import ast
from dataclasses import asdict
-import mmh3
import numpy as np
from numpy import ndarray
import pandas as pd
@@ -32,6 +31,7 @@ from pydantic.json import pydantic_encoder
from pydantic.dataclasses import dataclass
from haystack import is_imported
+from haystack.mmh3 import hash128

logger = logging.getLogger(__name__)
@@ -147,7 +147,7 @@ class Document:
        """
        if id_hash_keys is None:
-            return "{:02x}".format(mmh3.hash128(str(self.content), signed=False))
+            return "{:02x}".format(hash128(str(self.content)))

        final_hash_key = ""
        for attr in id_hash_keys:
@@ -163,7 +163,7 @@
                "Can't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta'] or be set to None."
            )
-        return "{:02x}".format(mmh3.hash128(final_hash_key, signed=False))
+        return "{:02x}".format(hash128(final_hash_key))

    def to_dict(self, field_map: Optional[Dict[str, Any]] = None) -> Dict:
        """

haystack/utils/squad_data.py

@@ -5,13 +5,12 @@ import json
import random
import pandas as pd
from tqdm.auto import tqdm
-import mmh3

from haystack import is_imported
+from haystack.mmh3 import hash128
from haystack.schema import Document, Label, Answer
from haystack.modeling.data_handler.processor import _read_squad_file
logger = logging.getLogger(__name__)
@@ -112,7 +111,7 @@ class SquadData:
            title = document.get("title", "")
            for paragraph in document["paragraphs"]:
                context = paragraph["context"]
-                document_id = paragraph.get("document_id", "{:02x}".format(mmh3.hash128(str(context), signed=False)))
+                document_id = paragraph.get("document_id", "{:02x}".format(hash128(str(context))))
                for question in paragraph["qas"]:
                    q = question["question"]
                    id = question["id"]

pyproject.toml

@@ -59,7 +59,6 @@ dependencies = [
    "dill",  # pickle extension for (de-)serialization
    "tqdm",  # progress bars in model download and training scripts
    "networkx",  # graphs library
-    "mmh3",  # fast hashing function (murmurhash3)
    "quantulum3",  # quantities extraction from text
    "posthog",  # telemetry
    "azure-ai-formrecognizer>=3.2.0b2",  # forms reader

test/agents/test_agent.py

@@ -7,7 +7,7 @@ from test.conftest import MockRetriever, MockPromptNode
from unittest import mock
import pytest

-from haystack import BaseComponent, Answer
+from haystack import BaseComponent, Answer, Document
from haystack.agents import Agent, AgentStep
from haystack.agents.base import Tool, ToolsManager
from haystack.nodes import PromptModel, PromptNode, PromptTemplate
@@ -276,6 +276,20 @@ def test_update_hash():
    assert agent.hash == "5ac8eca2f92c9545adcce3682b80d4c5"


+@pytest.mark.unit
+def test_tool_fails_processing_dict_result():
+    tool = Tool(name="name", pipeline_or_node=MockPromptNode(), description="description")
+    with pytest.raises(ValueError):
+        tool._process_result({"answer": "answer"})
+
+
+@pytest.mark.unit
+def test_tool_processes_answer_result_and_document_result():
+    tool = Tool(name="name", pipeline_or_node=MockPromptNode(), description="description")
+    assert tool._process_result(Answer(answer="answer")) == "answer"
+    assert tool._process_result(Document(content="content")) == "content"
+
+
def test_invalid_agent_template():
    pn = PromptNode()
    with pytest.raises(ValueError, match="some_non_existing_template not supported"):

test/modeling/test_processor.py

@@ -1,10 +1,11 @@
import copy
import logging
+from pathlib import Path

import pytest
from transformers import AutoTokenizer

-from haystack.modeling.data_handler.processor import SquadProcessor
+from haystack.modeling.data_handler.processor import SquadProcessor, _is_json
# during inference (parameter return_baskets = False) we do not convert labels
@@ -300,6 +301,17 @@ def test_dataset_from_dicts_qa_label_conversion(samples_path, caplog=None):
    ], f"Processing labels for {model} has changed."


+@pytest.mark.unit
+def test_is_json_identifies_json_objects():
+    """Test that _is_json correctly identifies json objects"""
+    # Paths to json files should be considered json
+    assert _is_json(Path("processor_config.json"))
+    # dicts should be considered json
+    assert _is_json({"a": 1})
+    # non-serializable objects should not be considered json
+    assert not _is_json(AutoTokenizer)
+
+
@pytest.mark.integration
def test_dataset_from_dicts_auto_determine_max_answers(samples_path, caplog=None):
    """

test/utils/test_squad_data.py

@@ -1,4 +1,6 @@
import pandas as pd
+import pytest
+
from haystack.utils.squad_data import SquadData
from haystack.utils.augment_squad import augment_squad
from haystack.schema import Document, Label, Answer
@@ -22,7 +24,8 @@ def test_squad_augmentation(samples_path):
    assert original_squad.count(unit="paragraph") == augmented_squad.count(unit="paragraph") * multiplication_factor


-def test_squad_to_df():
+@pytest.mark.unit
+def test_squad_data_converts_df_to_data():
    df = pd.DataFrame(
        [["title", "context", "question", "id", "answer", 1, False]],
        columns=["title", "context", "question", "id", "answer_text", "answer_start", "is_impossible"],
@@ -51,6 +54,35 @@ def test_squad_to_df():
    assert result == expected_result


+@pytest.mark.unit
+def test_squad_data_converts_data_to_df():
+    data = [
+        {
+            "title": "title",
+            "paragraphs": [
+                {
+                    "context": "context",
+                    "document_id": "document_id",
+                    "qas": [
+                        {
+                            "question": "question",
+                            "id": "id",
+                            "answers": [{"text": "answer", "answer_start": 1}],
+                            "is_impossible": False,
+                        }
+                    ],
+                }
+            ],
+        }
+    ]
+    expected_result = pd.DataFrame(
+        [["title", "context", "question", "id", "answer", 1, False, "document_id"]],
+        columns=["title", "context", "question", "id", "answer_text", "answer_start", "is_impossible", "document_id"],
+    )
+    result = SquadData.to_df(data)
+    assert result.equals(expected_result)
+
+
def test_to_label_object():
    squad_data_list = [
        {

test/utils/test_mmh3.py Normal file (+10 lines)

@@ -0,0 +1,10 @@
import pytest
from haystack.mmh3 import hash128
@pytest.mark.unit
def test_mmh3():
content = "This is the document text" * 100
hashed_content = hash128(content)
assert hashed_content == 305042678480070366459393623793278501577
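
The pinned digest guards the id-stability requirement: 305042678480070366459393623793278501577 is presumably the murmur3_128 (x64) value the old C extension produced for the same input. A further hedged sanity sketch (not part of the commit) exercising the otherwise-unused x86 code path via the x64arch flag:

    from haystack.mmh3 import hash128

    content = "This is the document text" * 100
    # both variants are deterministic; the x86 and x64 digests come from different mixing functions
    assert hash128(content, x64arch=True) == hash128(content, x64arch=True)
    assert hash128(content, x64arch=False) == hash128(content, x64arch=False)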