remove batch_vectorizer.py under common

This commit is contained in:
huaidong.xhd 2024-10-24 17:28:14 +08:00
parent b2dfe9ee15
commit b46e78adde
10 changed files with 115 additions and 314 deletions

3
.gitignore vendored
View File

@ -11,4 +11,5 @@
*.jc
*.pyc
/dist
.vscode/
.vscode/
__pycache__/

View File

@ -15,13 +15,115 @@ from typing import List
from kag.builder.model.sub_graph import SubGraph
from knext.common.base.runnable import Input, Output
from kag.common.vectorizer import Vectorizer, Neo4jBatchVectorizer
from kag.common.vectorizer import Vectorizer
from kag.interface.builder.vectorizer_abc import VectorizerABC
from knext.schema.client import SchemaClient
from knext.project.client import ProjectClient
from knext.schema.model.base import IndexTypeEnum
class EmbeddingVectorPlaceholder(object):
    """Marker for an embedding vector that has not been computed yet.

    The placeholder remembers which properties dict it belongs to and which
    field of that dict should eventually receive the vector.
    """

    def __init__(self, number, properties, vector_field, property_key, property_value):
        """Record the target dict/field and the source property to embed.

        Args:
            number: Value used as this placeholder's ``repr``.
            properties: Dict that ``replace`` writes the vector into.
            vector_field: Key in ``properties`` that receives the vector.
            property_key: Name of the property the text came from.
            property_value: Text stored for later embedding.
        """
        # No vector is known at construction time.
        self._embedding_vector = None
        self._number, self._properties = number, properties
        self._vector_field = vector_field
        self._property_key, self._property_value = property_key, property_value

    def replace(self):
        """Write the vector into the properties dict, if one has been set."""
        vector = self._embedding_vector
        if vector is None:
            # Nothing was generated; leave the properties untouched.
            return
        self._properties[self._vector_field] = vector

    def __repr__(self):
        """Return the ``repr`` of the placeholder's number."""
        return f"{self._number!r}"
class EmbeddingVectorManager(object):
    """Collect embedding placeholders and resolve them with one batched call.

    Placeholders are registered through ``get_placeholder``. ``batch_generate``
    embeds every distinct text once and ``patch`` writes the resulting vectors
    back into the properties dicts the placeholders point at.
    """

    def __init__(self):
        # Every placeholder handed out by get_placeholder, in creation order.
        self._placeholders = []

    def _create_vector_field_name(self, property_key):
        """Return the vector field name derived from ``property_key``.

        The name is ``"_"`` followed by the snake-cased ``"<property_key>_vector"``.
        """
        # Imported lazily, as in the original implementation.
        from kag.common.utils import to_snake_case

        name = f"{property_key}_vector"
        name = to_snake_case(name)
        return "_" + name

    def get_placeholder(self, properties, vector_field):
        """Create a placeholder for ``vector_field`` if a source property exists.

        Args:
            properties: Dict of node properties to search.
            vector_field: Name of the vector field that needs a value.

        Returns:
            A new placeholder registered with this manager, or ``None`` when
            no property maps to ``vector_field`` or the matching property is
            empty.

        Raises:
            RuntimeError: If the matching property value is not a string.
        """
        for property_key, property_value in properties.items():
            if self._create_vector_field_name(property_key) != vector_field:
                continue
            if not property_value:
                return None
            if not isinstance(property_value, str):
                message = f"property {property_key!r} must be string to generate embedding vector"
                raise RuntimeError(message)
            placeholder = EmbeddingVectorPlaceholder(
                len(self._placeholders),
                properties,
                vector_field,
                property_key,
                property_value,
            )
            self._placeholders.append(placeholder)
            return placeholder
        return None

    def _get_text_batch(self):
        """Group placeholders by their text so each text is embedded once."""
        text_batch = {}
        for placeholder in self._placeholders:
            text_batch.setdefault(placeholder._property_value, []).append(placeholder)
        return text_batch

    def _generate_vectors(self, vectorizer, text_batch):
        """Embed the distinct texts of ``text_batch`` with ``vectorizer``."""
        return vectorizer.vectorize(list(text_batch))

    def _fill_vectors(self, vectors, text_batch):
        """Assign each vector to all placeholders sharing the matching text."""
        # Dicts preserve insertion order, so items() lines up with the text
        # list that was passed to the vectorizer.
        for vector, (_text, placeholders) in zip(vectors, text_batch.items()):
            for placeholder in placeholders:
                placeholder._embedding_vector = vector

    def batch_generate(self, vectorizer):
        """Compute vectors for every registered placeholder.

        Args:
            vectorizer: Object exposing ``vectorize(texts)``, returning one
                vector per text in the same order.
        """
        text_batch = self._get_text_batch()
        if not text_batch:
            # Nothing to embed: skip the backend call entirely. An empty
            # request is wasted work and some services may reject it.
            return
        vectors = self._generate_vectors(vectorizer, text_batch)
        self._fill_vectors(vectors, text_batch)

    def patch(self):
        """Ask every placeholder to write its vector into its properties."""
        for placeholder in self._placeholders:
            placeholder.replace()
class EmbeddingVectorGenerator(object):
    """Fill in missing vector fields for a batch of ``(label, properties)`` nodes."""

    def __init__(self, vectorizer, vector_index_meta=None, extra_labels=("Entity",)):
        """Store the vectorizer and the label -> vector fields lookup.

        Args:
            vectorizer: Passed through to the manager's ``batch_generate``.
            vector_index_meta: Mapping from label to an iterable of vector
                field names; a falsy value is replaced by an empty dict.
            extra_labels: Labels consulted for every node in addition to the
                node's own label.
        """
        self._vector_index_meta = vector_index_meta or {}
        self._extra_labels = extra_labels
        self._vectorizer = vectorizer

    def batch_generate(self, node_batch):
        """Generate vectors for the nodes in ``node_batch``, mutating them.

        Each item is a ``(label, properties)`` pair. Vector fields already
        present in ``properties`` are left alone; missing ones temporarily
        receive a placeholder that is replaced once vectors are available.
        """
        index_meta = self._vector_index_meta
        manager = EmbeddingVectorManager()
        for node_label, properties in node_batch:
            candidate_labels = [node_label]
            if self._extra_labels:
                candidate_labels += self._extra_labels
            for candidate in candidate_labels:
                if candidate not in index_meta:
                    continue
                for vector_field in index_meta[candidate]:
                    if vector_field in properties:
                        # Already holds a value or a pending placeholder.
                        continue
                    pending = manager.get_placeholder(properties, vector_field)
                    if pending is not None:
                        properties[vector_field] = pending
        manager.batch_generate(self._vectorizer)
        manager.patch()
class BatchVectorizer(VectorizerABC):
def __init__(self, project_id: str = None, **kwargs):
@ -70,7 +172,7 @@ class BatchVectorizer(VectorizerABC):
name = to_snake_case(name)
return "_" + name
def _neo4j_batch_vectorize(self, vectorizer: Vectorizer, input: SubGraph) -> SubGraph:
def _generate_embedding_vectors(self, vectorizer: Vectorizer, input: SubGraph) -> SubGraph:
node_list = []
node_batch = []
for node in input.nodes:
@ -80,8 +182,8 @@ class BatchVectorizer(VectorizerABC):
properties.update(node.properties)
node_list.append((node, properties))
node_batch.append((node.label, properties.copy()))
batch_vectorizer = Neo4jBatchVectorizer(vectorizer, self.vec_meta)
batch_vectorizer.batch_vectorize(node_batch)
generator = EmbeddingVectorGenerator(vectorizer, self.vec_meta)
generator.batch_generate(node_batch)
for (node, properties), (_node_label, new_properties) in zip(
node_list, node_batch
):
@ -92,5 +194,5 @@ class BatchVectorizer(VectorizerABC):
return input
def invoke(self, input: Input, **kwargs) -> List[Output]:
modified_input = self._neo4j_batch_vectorize(self.vectorizer, input)
modified_input = self._generate_embedding_vectors(self.vectorizer, input)
return [modified_input]

View File

@ -10,7 +10,6 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
from kag.common.vectorizer.batch_vectorizer import Neo4jBatchVectorizer
from kag.common.vectorizer.local_bge_m3_vectorizer import LocalBGEM3Vectorizer
from kag.common.vectorizer.local_bge_vectorizer import LocalBGEVectorizer
from kag.common.vectorizer.openai_vectorizer import OpenAIVectorizer
@ -18,7 +17,6 @@ from kag.common.vectorizer.vectorizer import Vectorizer
__all__ = [
"Neo4jBatchVectorizer",
"LocalBGEM3Vectorizer",
"LocalBGEVectorizer",
"OpenAIVectorizer",

View File

@ -1,112 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
class Neo4jEmbeddingVectorPlaceholder(object):
    """Deferred embedding vector bound to one field of a properties dict."""

    def __init__(self, number, properties, vector_field, property_key, property_value):
        """Remember where the vector goes and which property produced the text.

        Args:
            number: Value used as this placeholder's ``repr``.
            properties: Dict that ``replace`` writes the vector into.
            vector_field: Key in ``properties`` that receives the vector.
            property_key: Name of the property the text came from.
            property_value: Text stored for later embedding.
        """
        self._number = number
        self._properties = properties
        self._vector_field = vector_field
        self._property_key = property_key
        self._property_value = property_value
        # Stays None until a vector is assigned from outside.
        self._embedding_vector = None

    def replace(self):
        """Store the vector under ``vector_field`` when one is available."""
        if self._embedding_vector is None:
            return
        target = self._properties
        target[self._vector_field] = self._embedding_vector

    def __repr__(self):
        """Return the ``repr`` of the placeholder's number."""
        number = self._number
        return repr(number)
class Neo4jEmbeddingVectorManager(object):
    """Register embedding placeholders and resolve them in a single batch.

    ``get_placeholder`` hands out placeholders, ``batch_vectorize`` embeds
    each distinct text once, and ``patch`` writes the vectors back into the
    properties dicts.
    """

    def __init__(self):
        # Placeholders created so far, in creation order.
        self._placeholders = []

    def _create_vector_field_name(self, property_key):
        """Return ``"_"`` plus the snake-cased ``"<property_key>_vector"``."""
        # Imported lazily, as in the original implementation.
        from kag.common.utils import to_snake_case

        name = f"{property_key}_vector"
        name = to_snake_case(name)
        return "_" + name

    def get_placeholder(self, properties, vector_field):
        """Create a placeholder for ``vector_field`` if a source property exists.

        Args:
            properties: Dict of node properties to search.
            vector_field: Name of the vector field that needs a value.

        Returns:
            A new registered placeholder, or ``None`` when no property maps to
            ``vector_field`` or the matching property is empty.

        Raises:
            RuntimeError: If the matching property value is not a string.
        """
        for property_key, property_value in properties.items():
            if self._create_vector_field_name(property_key) != vector_field:
                continue
            if not property_value:
                return None
            if not isinstance(property_value, str):
                message = f"property {property_key!r} must be string to generate embedding vector"
                raise RuntimeError(message)
            placeholder = Neo4jEmbeddingVectorPlaceholder(
                len(self._placeholders),
                properties,
                vector_field,
                property_key,
                property_value,
            )
            self._placeholders.append(placeholder)
            return placeholder
        return None

    def _get_text_batch(self):
        """Group placeholders by their text so each text is embedded once."""
        text_batch = {}
        for placeholder in self._placeholders:
            text_batch.setdefault(placeholder._property_value, []).append(placeholder)
        return text_batch

    def _generate_vectors(self, vectorizer, text_batch):
        """Embed the distinct texts of ``text_batch`` with ``vectorizer``."""
        return vectorizer.vectorize(list(text_batch))

    def _fill_vectors(self, vectors, text_batch):
        """Assign each vector to all placeholders sharing the matching text."""
        # items() iterates in insertion order, matching the text list that
        # was sent to the vectorizer.
        for vector, (_text, placeholders) in zip(vectors, text_batch.items()):
            for placeholder in placeholders:
                placeholder._embedding_vector = vector

    def batch_vectorize(self, vectorizer):
        """Compute vectors for every registered placeholder.

        Args:
            vectorizer: Object exposing ``vectorize(texts)``, returning one
                vector per text in the same order.
        """
        text_batch = self._get_text_batch()
        if not text_batch:
            # No texts were collected: avoid a pointless backend call, which
            # some services may reject for empty input.
            return
        vectors = self._generate_vectors(vectorizer, text_batch)
        self._fill_vectors(vectors, text_batch)

    def patch(self):
        """Ask every placeholder to write its vector into its properties."""
        for placeholder in self._placeholders:
            placeholder.replace()
class Neo4jBatchVectorizer(object):
    """Populate missing vector fields on a batch of ``(label, properties)`` nodes."""

    def __init__(self, vectorizer, vector_index_meta=None, extra_labels=("Entity",)):
        """Keep the vectorizer and the label -> vector fields lookup.

        Args:
            vectorizer: Passed through to the manager's ``batch_vectorize``.
            vector_index_meta: Mapping from label to an iterable of vector
                field names; a falsy value is replaced by an empty dict.
            extra_labels: Labels consulted for every node in addition to the
                node's own label.
        """
        self._vector_index_meta = vector_index_meta or {}
        self._extra_labels = extra_labels
        self._vectorizer = vectorizer

    def batch_vectorize(self, node_batch):
        """Generate vectors for the nodes in ``node_batch``, mutating them.

        Vector fields already present in a node's properties are skipped;
        missing ones get a placeholder that is swapped for the real vector
        after the batched embedding call.
        """
        meta = self._vector_index_meta
        manager = Neo4jEmbeddingVectorManager()
        for own_label, properties in node_batch:
            lookup_labels = [own_label]
            if self._extra_labels:
                lookup_labels += self._extra_labels
            for current in lookup_labels:
                if current not in meta:
                    continue
                for vector_field in meta[current]:
                    if vector_field in properties:
                        # Already holds a value or a pending placeholder.
                        continue
                    pending = manager.get_placeholder(properties, vector_field)
                    if pending is not None:
                        properties[vector_field] = pending
        manager.batch_vectorize(self._vectorizer)
        manager.patch()

View File

@ -3,7 +3,7 @@ from typing import List
import numpy as np
from kag.common.vectorizer.vectorizer import Vectorizer
from kag.common.vectorizer import Vectorizer
def cosine_similarity(vector1, vector2):
@ -86,4 +86,4 @@ class TextSimilarity:
res = self.text_sim_result(mention, candidates, topk)
if len(res) == 0:
return [('Entity', 1.)]
return res
return res

View File

@ -1,132 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
import base64
import unittest
from tabulate import tabulate
from kag.common.vectorizer.vectorizer import Vectorizer
class TestDifferentVectorizers(unittest.TestCase):
    """Different vectorizers unit test"""

    @staticmethod
    def _decode(encoded):
        """Decode a base64 literal into a UTF-8 string."""
        return base64.b64decode(encoded).decode("utf-8")

    def _get_bge_zh_vectorizer(self):
        """Build the vectorizer registered under the name ``bge_zh``."""
        path = self._decode("fi8uY2FjaGUvdmVjdG9yaXplci9CQUFJL2JnZS1iYXNlLXpoLXYxLjU=")
        host = self._decode("YWxwcy1jb21tb24ub3NzLWNuLWhhbmd6aG91LXptZi5hbGl5dW5jcy5jb20=")
        model = self._decode("YWxwcy9odWFpZG9uZy54aGQvRG9jdW1lbnRzL21vZGVscy9CQUFJLWJnZS1iYXNlLXpoLXYxLjUudGFyLmd6")
        return Vectorizer.from_config(
            {
                "vectorizer": "kag.common.vectorizer.LocalVectorizer",
                "path": path,
                "url": "https://%s/%s" % (host, model),
            }
        )

    def _get_contriever_vectorizer(self):
        """Build the vectorizer registered under the name ``contriever``."""
        path = self._decode("fi8uY2FjaGUvdmVjdG9yaXplci9mYWNlYm9vay9jb250cmlldmVy")
        host = self._decode("YWxwcy1jb21tb24ub3NzLWNuLWhhbmd6aG91LXptZi5hbGl5dW5jcy5jb20=")
        model = self._decode("YWxwcy9odWFpZG9uZy54aGQvRG9jdW1lbnRzL21vZGVscy9mYWNlYm9vay1jb250cmlldmVyLnRhci5neg==")
        return Vectorizer.from_config(
            {
                "vectorizer": "kag.common.vectorizer.ContrieverVectorizer",
                "path": path,
                "url": "https://%s/%s" % (host, model),
                "normalize": True,
            }
        )

    def _get_openai_vectorizer(self):
        """Build the vectorizer registered under the name ``openai``."""
        return Vectorizer.from_config(
            {
                "vectorizer": "kag.common.vectorizer.OpenAIVectorizer",
                "nn_name": "text-embedding-ada-002",
                "openai_api_key": "EMPTY",
                "openai_api_base": "http://127.0.0.1:38080/v1"
            }
        )

    def _get_bge_en_vectorizer(self):
        """Build the vectorizer registered under the name ``bge_en``."""
        path = self._decode("fi8uY2FjaGUvdmVjdG9yaXplci9CQUFJL2JnZS1iYXNlLWVuLXYxLjU=")
        host = self._decode("YWxwcy1jb21tb24ub3NzLWNuLWhhbmd6aG91LXptZi5hbGl5dW5jcy5jb20=")
        model = self._decode("YWxwcy9odWFpZG9uZy54aGQvRG9jdW1lbnRzL21vZGVscy9CQUFJLWJnZS1iYXNlLWVuLXYxLjUudGFyLmd6")
        return Vectorizer.from_config(
            {
                "vectorizer": "kag.common.vectorizer.LocalVectorizer",
                "path": path,
                "url": "https://%s/%s" % (host, model),
            }
        )

    def _get_bge_m3_vectorizer(self):
        """Build the vectorizer registered under the name ``bge_m3``."""
        path = self._decode("fi8uY2FjaGUvdmVjdG9yaXplci9CQUFJL2JnZS1tMw==")
        host = self._decode("YWxwcy1jb21tb24ub3NzLWNuLWhhbmd6aG91LXptZi5hbGl5dW5jcy5jb20=")
        model = self._decode("YWxwcy9odWFpZG9uZy54aGQvRG9jdW1lbnRzL21vZGVscy9CQUFJLWJnZS1tMy50YXIuZ3o=")
        return Vectorizer.from_config(
            {
                "vectorizer": "kag.common.vectorizer.LocalBGEM3Vectorizer",
                "path": path,
                "url": "https://%s/%s" % (host, model),
            }
        )

    def _get_vectorizers(self):
        """Return ``(name, vectorizer)`` pairs for every vectorizer under test."""
        return (
            ("bge_zh", self._get_bge_zh_vectorizer()),
            ("contriever", self._get_contriever_vectorizer()),
            ("openai", self._get_openai_vectorizer()),
            ("bge_en", self._get_bge_en_vectorizer()),
            ("bge_m3", self._get_bge_m3_vectorizer()),
        )

    def setUp(self):
        """Instantiate every vectorizer before each test."""
        self.vectorizers = self._get_vectorizers()

    def tearDown(self):
        """No cleanup is performed."""
        pass

    def testVectorize(self):
        """Print each vectorizer's dot products against the first input.

        The test only prints a table; it makes no assertions.
        """
        inputs = [
            "George Washington",
            "Father of the United States",
            "President Washington",
            "The American George",
            "Washington the Great",
        ]
        # Second input set; defined here but not used by the code below.
        inputs2 = [
            "诸葛亮",
            "卧龙先生",
            "诸葛丞相",
            "武乡侯",
            "孔明先生",
        ]
        headers = ("#", *(name for name, _vectorizer in self.vectorizers))
        # One column per vectorizer: dot product of every vector with the
        # vector of the first input.
        columns = []
        for _name, vectorizer in self.vectorizers:
            vecs = vectorizer.vectorize(inputs)
            columns.append(
                [sum(x * y for x, y in zip(vecs[0], vec)) for vec in vecs]
            )
        # Turn the columns into table rows prefixed with the row index.
        data = [
            [i] + [column[i] for column in columns]
            for i in range(len(columns[0]))
        ]
        print(tabulate(data, headers=headers))
if __name__ == "__main__":
unittest.main()

View File

@ -1,56 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
import base64
import unittest
from kag.common.vectorizer.vectorizer import Vectorizer
class TestVectorizer(unittest.TestCase):
    """Vectorizer unit test"""

    def _get_local_vectorizer(self):
        """Build a ``LocalVectorizer`` from the base64-encoded settings."""
        path = base64.b64decode("fi8uY2FjaGUvdmVjdG9yaXplci9CQUFJL2JnZS1iYXNlLXpoLXYxLjU=").decode("utf-8")
        # NOTE(review): the host literal is byte-identical to the model
        # literal below, so the URL repeats the model path where a host name
        # would be expected. Confirm the intended host before relying on it.
        host = base64.b64decode("YWxwcy9odWFpZG9uZy54aGQvRG9jdW1lbnRzL21vZGVscy9CQUFJLWJnZS1iYXNlLXpoLXYxLjUudGFyLmd6").decode("utf-8")
        model = base64.b64decode("YWxwcy9odWFpZG9uZy54aGQvRG9jdW1lbnRzL21vZGVscy9CQUFJLWJnZS1iYXNlLXpoLXYxLjUudGFyLmd6").decode("utf-8")
        config = {
            "vectorizer": "kag.common.vectorizer.LocalVectorizer",
            "path": path,
            "url": "https://%s/inference/%s" % (host, model),
        }
        return Vectorizer.from_config(config)

    def _get_vectorizer(self, *, local=False):
        """Return the vectorizer under test.

        Args:
            local: When true, use the local vectorizer; otherwise use the
                ``_get_maya_vectorizer`` factory if one is available.

        Raises:
            unittest.SkipTest: If a non-local vectorizer is requested but no
                ``_get_maya_vectorizer`` method exists on this test case.
        """
        if local:
            return self._get_local_vectorizer()
        # This class does not define _get_maya_vectorizer, so calling it
        # unconditionally fails with AttributeError. Skip explicitly instead,
        # while still honouring a factory supplied by a subclass or mixin.
        factory = getattr(self, "_get_maya_vectorizer", None)
        if factory is None:
            self.skipTest("no non-local vectorizer factory (_get_maya_vectorizer) is defined")
        return factory()

    def setUp(self):
        """Create the vectorizer used by the tests."""
        self.vectorizer = self._get_vectorizer(local=False)

    def tearDown(self):
        """No cleanup is performed."""
        pass

    def testVectorize(self):
        """Check that two paraphrased questions have a dot product >= 0.75."""
        texts = ["How old are you?", "What is your age?"]
        vecs = self.vectorizer.vectorize(texts)
        similarity = sum(x * y for x, y in zip(*vecs))
        print("similarity: %g" % similarity)
        # assertGreaterEqual reports both operands when the check fails.
        self.assertGreaterEqual(similarity, 0.75)
if __name__ == "__main__":
unittest.main()

View File

@ -6,7 +6,7 @@ import numpy as np
from kag.common.env import init_kag_config
from kag.common.graphstore.neo4j_graph_store import Neo4jClient
from kag.common.vectorizer.vectorizer import Vectorizer
from kag.common.vectorizer import Vectorizer
from kag.interface.solver.lf_planner_abc import LFPlannerABC
from kag.solver.implementation.default_reasoner import DefaultReasoner
from kag.solver.implementation.lf_chunk_retriever import LFChunkRetriever
@ -217,4 +217,4 @@ class KgCATest(unittest.TestCase):
def test_re_failed_resp_7(self):
res = self.run_question("""Which city was the band which was formed in 1981 in Los Angeles when vocalist/guitarist James Hetfield responded to an advertisement posted by drummer Lars Ulrich in a local newspaper hosted by L'Amour?""")
assert "New York" in res
assert "New York" in res

View File

@ -3,7 +3,7 @@ import unittest
from kag.common.env import init_kag_config
from kag.common.graphstore.neo4j_graph_store import Neo4jClient
from kag.common.vectorizer.vectorizer import Vectorizer
from kag.common.vectorizer import Vectorizer
from kag.solver.implementation.default_kg_retrieval import KGRetrieverByLlm
from kag.solver.implementation.default_lf_planner import DefaultLFPlanner
from kag.solver.implementation.lf_chunk_retriever import LFChunkRetriever

View File

@ -1,6 +1,6 @@
import unittest
import os
from kag.common.vectorizer.vectorizer import Vectorizer
from kag.common.vectorizer import Vectorizer
class TestOllamaVectorizer(unittest.TestCase):