mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-24 05:28:42 +00:00
Add InMemoryKnowledgeGraph (#2678)
* draft for InMemoryKnowledgeGraph * remove comments * Update Documentation & Code Style * fix import and signature * Fix dependencies for in_memory_knowlede_graph * updated tutorials * Update Documentation & Code Style * fix bug in notebook * fix other notebook bug * Update Documentation & Code Style * improved tutorial notebook * Update Documentation & Code Style * better implementation of InMemoryKnowledgeGraph * fix * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
b87c0c950b
commit
b01a7c2259
@ -22,7 +22,7 @@ The training of models that translate text queries into SPARQL queries is curren
|
||||
|
||||
# Install the latest master of Haystack
|
||||
!pip install --upgrade pip
|
||||
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,graphdb]
|
||||
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,inmemorygraph]
|
||||
```
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ import time
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.nodes import Text2SparqlRetriever
|
||||
from haystack.document_stores import GraphDBKnowledgeGraph
|
||||
from haystack.document_stores import InMemoryKnowledgeGraph
|
||||
from haystack.utils import fetch_archive_from_http
|
||||
```
|
||||
|
||||
@ -54,44 +54,24 @@ s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
|
||||
fetch_archive_from_http(url=s3_url, output_dir=model_dir)
|
||||
```
|
||||
|
||||
## Launching a GraphDB instance
|
||||
## Initialize a knowledge graph and load data
|
||||
|
||||
Currently, Haystack supports two alternative implementations for knowledge graphs:
|
||||
* simple InMemoryKnowledgeGraph (based on RDFLib in-memory store)
|
||||
* GraphDBKnowledgeGraph, which runs on GraphDB.
|
||||
|
||||
### InMemoryKnowledgeGraph
|
||||
|
||||
|
||||
```python
|
||||
# Unfortunately, there seems to be no good way to run GraphDB in colab environments
|
||||
# In your local environment, you could start a GraphDB server with docker
|
||||
# Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/
|
||||
import os
|
||||
|
||||
LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", False)
|
||||
|
||||
if LAUNCH_GRAPHDB:
|
||||
print("Starting GraphDB ...")
|
||||
status = subprocess.run(
|
||||
[
|
||||
"docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11"
|
||||
],
|
||||
shell=True,
|
||||
)
|
||||
if status.returncode:
|
||||
raise Exception(
|
||||
"Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?"
|
||||
)
|
||||
time.sleep(5)
|
||||
```
|
||||
|
||||
## Creating a new GraphDB repository (also known as index in haystack's document stores)
|
||||
|
||||
|
||||
```python
|
||||
# Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
|
||||
kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
|
||||
# Initialize an in-memory knowledge graph and use "tutorial_10_index" as the name of the index
|
||||
kg = InMemoryKnowledgeGraph(index="tutorial_10_index")
|
||||
|
||||
# Delete the index as it might have been already created in previous runs
|
||||
kg.delete_index()
|
||||
|
||||
# Create the index based on a configuration file
|
||||
kg.create_index(config_path=Path(graph_dir) / "repo-config.ttl")
|
||||
# Create the index
|
||||
kg.create_index()
|
||||
|
||||
# Import triples of subject, predicate, and object statements from a ttl file
|
||||
kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl")
|
||||
@ -99,15 +79,69 @@ print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[
|
||||
print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")
|
||||
```
|
||||
|
||||
### GraphDBKnowledgeGraph (alternative)
|
||||
|
||||
#### Launching a GraphDB instance
|
||||
|
||||
|
||||
```python
|
||||
# Define prefixes for names of resources so that we can use shorter resource names in queries
|
||||
prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
||||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
||||
PREFIX hp: <https://deepset.ai/harry_potter/>
|
||||
"""
|
||||
kg.prefixes = prefixes
|
||||
# # Unfortunately, there seems to be no good way to run GraphDB in colab environments
|
||||
# # In your local environment, you could start a GraphDB server with docker
|
||||
# # Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/
|
||||
# import os
|
||||
|
||||
# LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", False)
|
||||
|
||||
# if LAUNCH_GRAPHDB:
|
||||
# print("Starting GraphDB ...")
|
||||
# status = subprocess.run(
|
||||
# [
|
||||
# "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11"
|
||||
# ],
|
||||
# shell=True,
|
||||
# )
|
||||
# if status.returncode:
|
||||
# raise Exception(
|
||||
# "Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?"
|
||||
# )
|
||||
# time.sleep(5)
|
||||
```
|
||||
|
||||
#### Creating a new GraphDB repository (also known as index in haystack's document stores)
|
||||
|
||||
|
||||
```python
|
||||
# from haystack.document_stores import GraphDBKnowledgeGraph
|
||||
|
||||
# # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
|
||||
# kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
|
||||
|
||||
# # Delete the index as it might have been already created in previous runs
|
||||
# kg.delete_index()
|
||||
|
||||
# # Create the index based on a configuration file
|
||||
# kg.create_index(config_path=Path(graph_dir) / "repo-config.ttl")
|
||||
|
||||
# # Import triples of subject, predicate, and object statements from a ttl file
|
||||
# kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl")
|
||||
# print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}")
|
||||
# print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# # Define prefixes for names of resources so that we can use shorter resource names in queries
|
||||
# prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
||||
# PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
||||
# PREFIX hp: <https://deepset.ai/harry_potter/>
|
||||
# """
|
||||
# kg.prefixes = prefixes
|
||||
```
|
||||
|
||||
## Load the pre-trained retriever
|
||||
|
||||
|
||||
```python
|
||||
# Load a pre-trained model that translates text queries to SPARQL queries
|
||||
kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=Path(model_dir) / "hp_v3.4")
|
||||
```
|
||||
|
||||
@ -23,3 +23,6 @@ else:
|
||||
MilvusDocumentStore = safe_import("haystack.document_stores.milvus2", "Milvus2DocumentStore", "milvus")
|
||||
WeaviateDocumentStore = safe_import("haystack.document_stores.weaviate", "WeaviateDocumentStore", "weaviate")
|
||||
GraphDBKnowledgeGraph = safe_import("haystack.document_stores.graphdb", "GraphDBKnowledgeGraph", "graphdb")
|
||||
InMemoryKnowledgeGraph = safe_import(
|
||||
"haystack.document_stores.memory_knowledgegraph", "InMemoryKnowledgeGraph", "inmemorygraph"
|
||||
)
|
||||
|
||||
137
haystack/document_stores/memory_knowledgegraph.py
Normal file
137
haystack/document_stores/memory_knowledgegraph.py
Normal file
@ -0,0 +1,137 @@
|
||||
from typing import Dict, Optional
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from rdflib import Graph
|
||||
|
||||
from haystack.document_stores import BaseKnowledgeGraph
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InMemoryKnowledgeGraph(BaseKnowledgeGraph):
    """
    In-memory knowledge graph store, based on rdflib.

    Every index is a separate rdflib ``Graph`` kept in ``self.indexes`` under the index name.
    An index has to be created with ``create_index()`` before data can be imported into it or queried from it.
    """

    def __init__(self, index: str = "document"):
        """
        Init the in-memory knowledge graph.

        :param index: name of the default index, used by every method whose ``index`` argument is omitted
        """
        super().__init__()

        # A plain dict on purpose: a defaultdict would silently insert a placeholder whenever a missing index is
        # merely read, which makes create_index() believe the index exists although no Graph was ever built.
        self.indexes: Dict[str, Graph] = {}
        self.index: str = index

    def _get_graph(self, index: Optional[str] = None) -> Graph:
        """
        Look up the graph that backs an index.

        :param index: name of the index; falls back to the default index when omitted
        :return: the rdflib graph stored under that name
        :raises KeyError: if no index with that name has been created
        """
        name = index or self.index
        try:
            return self.indexes[name]
        except KeyError:
            raise KeyError(f"Index '{name}' does not exist. Create it with create_index() first.") from None

    def create_index(self, index: Optional[str] = None):
        """
        Create a new, empty index stored in memory. If the index already exists, only a warning is logged
        and the existing data is left untouched.

        :param index: name of the index; falls back to the default index when omitted
        """
        index = index or self.index
        if index not in self.indexes:
            self.indexes[index] = Graph()
        else:
            logger.warning(f"Index '{index}' is already present.")

    def delete_index(self, index: Optional[str] = None):
        """
        Delete an existing index. The index including all data will be removed.
        Deleting an index that does not exist is a silent no-op.

        :param index: The name of the index to delete; falls back to the default index when omitted.
        """
        index = index or self.index

        if index in self.indexes:
            del self.indexes[index]
            logger.info(f"Index '{index}' deleted.")

    def import_from_ttl_file(self, path: Path, index: Optional[str] = None):
        """
        Load in memory an existing knowledge graph represented in the form of triples of subject, predicate,
        and object from a .ttl file

        :param path: path to a .ttl containing a knowledge graph
        :param index: name of the index; falls back to the default index when omitted
        :raises KeyError: if the index has not been created
        """
        # NOTE(review): no explicit format is passed, so rdflib presumably infers Turtle from the file name - confirm.
        self._get_graph(index).parse(path)

    def get_all_triples(self, index: Optional[str] = None):
        """
        Query the given in memory index for all its stored triples. Duplicates are not filtered.

        :param index: name of the index
        :return: all triples stored in the index, in the format produced by ``query()``
        """
        sparql_query = "SELECT * WHERE { ?s ?p ?o. }"
        results = self.query(sparql_query=sparql_query, index=index)
        return results

    def get_all_subjects(self, index: Optional[str] = None):
        """
        Query the given in memory index for all its stored subjects. Duplicates are not filtered.

        :param index: name of the index
        :return: all subjects stored in the index, in the format produced by ``query()``
        """
        sparql_query = "SELECT ?s WHERE { ?s ?p ?o. }"
        results = self.query(sparql_query=sparql_query, index=index)
        return results

    def get_all_predicates(self, index: Optional[str] = None):
        """
        Query the given in memory index for all its stored predicates. Duplicates are not filtered.

        :param index: name of the index
        :return: all predicates stored in the index, in the format produced by ``query()``
        """
        sparql_query = "SELECT ?p WHERE { ?s ?p ?o. }"
        results = self.query(sparql_query=sparql_query, index=index)
        return results

    def _create_document_field_map(self) -> Dict:
        """
        There is no field mapping required
        """
        return {}

    def get_all_objects(self, index: Optional[str] = None):
        """
        Query the given in memory index for all its stored objects. Duplicates are not filtered.

        :param index: name of the index
        :return: all objects stored in the index, in the format produced by ``query()``
        """
        sparql_query = "SELECT ?o WHERE { ?s ?p ?o. }"
        results = self.query(sparql_query=sparql_query, index=index)
        return results

    def query(self, sparql_query: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None):
        """
        Execute a SPARQL query on the given in memory index

        :param sparql_query: SPARQL query that shall be executed
        :param index: name of the index; falls back to the default index when omitted
        :param headers: not used by this implementation (presumably kept so the signature matches other
                        knowledge graph implementations - confirm)
        :return: the ``askAnswer`` of the rdflib result when it is set; otherwise a list with one dictionary per
                 result row that maps each variable name to ``{"type": "uri", "value": <python value>}``
        :raises KeyError: if the index has not been created
        """
        raw_results = self._get_graph(index).query(sparql_query)

        if raw_results.askAnswer is not None:
            return raw_results.askAnswer

        formatted_results = []
        for binding in raw_results.bindings:
            formatted_result = {}
            for variable, term in binding.items():
                # Drop the first character of the variable's Python form (presumably the leading "?" - confirm).
                name = variable.toPython()[1:]
                # NOTE(review): "type" is reported as "uri" for every bound value, whatever kind of term it is.
                formatted_result[name] = {"type": "uri", "value": term.toPython()}
            formatted_results.append(formatted_result)
        return formatted_results
|
||||
@ -40,6 +40,9 @@
|
||||
{
|
||||
"$ref": "#/definitions/InMemoryDocumentStoreComponent"
|
||||
},
|
||||
{
|
||||
"$ref": "#/definitions/InMemoryKnowledgeGraphComponent"
|
||||
},
|
||||
{
|
||||
"$ref": "#/definitions/Milvus2DocumentStoreComponent"
|
||||
},
|
||||
@ -845,6 +848,40 @@
|
||||
],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"InMemoryKnowledgeGraphComponent": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"title": "Name",
|
||||
"description": "Custom name for the component. Helpful for visualization and debugging.",
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"title": "Type",
|
||||
"description": "Haystack Class name for the component.",
|
||||
"type": "string",
|
||||
"const": "InMemoryKnowledgeGraph"
|
||||
},
|
||||
"params": {
|
||||
"title": "Parameters",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"index": {
|
||||
"title": "Index",
|
||||
"default": "document",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"description": "Each parameter can reference other components defined in the same YAML file."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"type",
|
||||
"name"
|
||||
],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"Milvus2DocumentStoreComponent": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
||||
@ -146,10 +146,12 @@ pinecone =
|
||||
farm-haystack[sql,only-pinecone]
|
||||
graphdb =
|
||||
SPARQLWrapper
|
||||
inmemorygraph =
|
||||
SPARQLWrapper
|
||||
docstores =
|
||||
farm-haystack[faiss,milvus,weaviate,graphdb,pinecone]
|
||||
farm-haystack[faiss,milvus,weaviate,graphdb,inmemorygraph,pinecone]
|
||||
docstores-gpu =
|
||||
farm-haystack[faiss-gpu,milvus,weaviate,graphdb,pinecone]
|
||||
farm-haystack[faiss-gpu,milvus,weaviate,graphdb,inmemorygraph,pinecone]
|
||||
|
||||
audio =
|
||||
espnet
|
||||
|
||||
@ -3,7 +3,7 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from haystack.nodes import Text2SparqlRetriever
|
||||
from haystack.document_stores import GraphDBKnowledgeGraph
|
||||
from haystack.document_stores import GraphDBKnowledgeGraph, InMemoryKnowledgeGraph
|
||||
from haystack.utils import fetch_archive_from_http
|
||||
|
||||
|
||||
@ -60,3 +60,50 @@ def test_graph_retrieval():
|
||||
sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
|
||||
)
|
||||
assert result[0][0] == "https://deepset.ai/harry_potter/Otter"
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_inmemory_graph_retrieval():
    """
    Integration test: load the tutorial 10 triples into an InMemoryKnowledgeGraph, check that a known triple
    is stored, and run a text query plus two raw SPARQL queries through a Text2SparqlRetriever.
    """
    # TODO rename doc_dir
    graph_dir = "../data/tutorial10_knowledge_graph/"
    fetch_archive_from_http(
        url="https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip", output_dir=graph_dir
    )

    # Fetch a pre-trained BART model that translates natural language questions to SPARQL queries
    model_dir = "../saved_models/tutorial10_knowledge_graph/"
    fetch_archive_from_http(
        url="https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip", output_dir=model_dir
    )

    # Start from a fresh index and fill it with the downloaded triples
    kg = InMemoryKnowledgeGraph(index="tutorial_10_index")
    kg.delete_index()
    kg.create_index()
    kg.import_from_ttl_file(path=Path(graph_dir) / "triples.ttl", index="tutorial_10_index")

    # One triple that is expected to be present in the imported data
    expected_triple = {
        "s": {"type": "uri", "value": "https://deepset.ai/harry_potter/Melody_fawley"},
        "p": {"type": "uri", "value": "https://deepset.ai/harry_potter/_paternalgrandfather"},
        "o": {"type": "uri", "value": "https://deepset.ai/harry_potter/Marshall_fawley"},
    }
    stored_triples = kg.get_all_triples()
    assert len(stored_triples) > 0
    assert expected_triple in stored_triples

    kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")

    # Natural language question that the model translates to SPARQL
    expected_answer = {
        "answer": ["https://deepset.ai/harry_potter/Gryffindor"],
        "prediction_meta": {
            "model": "Text2SparqlRetriever",
            "sparql_query": "select ?a { hp:Harry_potter hp:house ?a . }",
        },
    }
    retrieved = kgqa_retriever.retrieve(query="In which house is Harry Potter?")
    assert retrieved[0] == expected_answer

    # SPARQL query with prefixed resource names
    prefixed_query = "select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }"
    prefixed_result = kgqa_retriever._query_kg(sparql_query=prefixed_query)
    assert prefixed_result[0][0] == "https://deepset.ai/harry_potter/Rubeus_hagrid"

    # SPARQL query with full resource names
    full_name_query = "select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
    full_name_result = kgqa_retriever._query_kg(sparql_query=full_name_query)
    assert full_name_result[0][0] == "https://deepset.ai/harry_potter/Otter"
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
@ -23,6 +22,9 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
@ -34,7 +36,7 @@
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install --upgrade pip\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,graphdb]"
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,inmemorygraph]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -42,6 +44,9 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
@ -55,14 +60,13 @@
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from haystack.nodes import Text2SparqlRetriever\n",
|
||||
"from haystack.document_stores import GraphDBKnowledgeGraph\n",
|
||||
"from haystack.document_stores import InMemoryKnowledgeGraph\n",
|
||||
"from haystack.utils import fetch_archive_from_http"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
@ -76,6 +80,9 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
@ -96,80 +103,50 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Launching a GraphDB instance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Unfortunately, there seems to be no good way to run GraphDB in colab environments\n",
|
||||
"# In your local environment, you could start a GraphDB server with docker\n",
|
||||
"# Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"LAUNCH_GRAPHDB = os.environ.get(\"LAUNCH_GRAPHDB\", False)\n",
|
||||
"\n",
|
||||
"if LAUNCH_GRAPHDB:\n",
|
||||
" print(\"Starting GraphDB ...\")\n",
|
||||
" status = subprocess.run(\n",
|
||||
" [\n",
|
||||
" \"docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11\"\n",
|
||||
" ],\n",
|
||||
" shell=True,\n",
|
||||
" )\n",
|
||||
" if status.returncode:\n",
|
||||
" raise Exception(\n",
|
||||
" \"Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?\"\n",
|
||||
" )\n",
|
||||
" time.sleep(5)"
|
||||
"## Initialize a knowledge graph and load data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Creating a new GraphDB repository (also known as index in haystack's document stores)"
|
||||
"Currently, Haystack supports two alternative implementations for knowledge graphs:\n",
|
||||
"* simple InMemoryKnowledgeGraph (based on RDFLib in-memory store)\n",
|
||||
"* GraphDBKnowledgeGraph, which runs on GraphDB."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### InMemoryKnowledgeGraph "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The last triple stored in the knowledge graph is: {'s': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/Harry_potter'}, 'p': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/family'}, 'o': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/Dudley_dursleys_children'}}\n",
|
||||
"There are 118543 triples stored in the knowledge graph.\n"
|
||||
]
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
],
|
||||
"source": [
|
||||
"# Initialize a knowledge graph connected to GraphDB and use \"tutorial_10_index\" as the name of the index\n",
|
||||
"kg = GraphDBKnowledgeGraph(index=\"tutorial_10_index\")\n",
|
||||
"# Initialize an in-memory knowledge graph and use \"tutorial_10_index\" as the name of the index\n",
|
||||
"kg = InMemoryKnowledgeGraph(index=\"tutorial_10_index\")\n",
|
||||
"\n",
|
||||
"# Delete the index as it might have been already created in previous runs\n",
|
||||
"kg.delete_index()\n",
|
||||
"\n",
|
||||
"# Create the index based on a configuration file\n",
|
||||
"kg.create_index(config_path=Path(graph_dir) / \"repo-config.ttl\")\n",
|
||||
"# Create the index\n",
|
||||
"kg.create_index()\n",
|
||||
"\n",
|
||||
"# Import triples of subject, predicate, and object statements from a ttl file\n",
|
||||
"kg.import_from_ttl_file(index=\"tutorial_10_index\", path=Path(graph_dir) / \"triples.ttl\")\n",
|
||||
@ -177,24 +154,140 @@
|
||||
"print(f\"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true,
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"### GraphDBKnowledgeGraph (alternative)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"#### Launching a GraphDB instance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define prefixes for names of resources so that we can use shorter resource names in queries\n",
|
||||
"prefixes = \"\"\"PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
|
||||
"PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
|
||||
"PREFIX hp: <https://deepset.ai/harry_potter/>\n",
|
||||
"\"\"\"\n",
|
||||
"kg.prefixes = prefixes\n",
|
||||
"# # Unfortunately, there seems to be no good way to run GraphDB in colab environments\n",
|
||||
"# # In your local environment, you could start a GraphDB server with docker\n",
|
||||
"# # Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/\n",
|
||||
"# import os\n",
|
||||
"\n",
|
||||
"# LAUNCH_GRAPHDB = os.environ.get(\"LAUNCH_GRAPHDB\", False)\n",
|
||||
"\n",
|
||||
"# if LAUNCH_GRAPHDB:\n",
|
||||
"# print(\"Starting GraphDB ...\")\n",
|
||||
"# status = subprocess.run(\n",
|
||||
"# [\n",
|
||||
"# \"docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11\"\n",
|
||||
"# ],\n",
|
||||
"# shell=True,\n",
|
||||
"# )\n",
|
||||
"# if status.returncode:\n",
|
||||
"# raise Exception(\n",
|
||||
"# \"Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?\"\n",
|
||||
"# )\n",
|
||||
"# time.sleep(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"#### Creating a new GraphDB repository (also known as index in haystack's document stores)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# from haystack.document_stores import GraphDBKnowledgeGraph\n",
|
||||
"\n",
|
||||
"# # Initialize a knowledge graph connected to GraphDB and use \"tutorial_10_index\" as the name of the index\n",
|
||||
"# kg = GraphDBKnowledgeGraph(index=\"tutorial_10_index\")\n",
|
||||
"\n",
|
||||
"# # Delete the index as it might have been already created in previous runs\n",
|
||||
"# kg.delete_index()\n",
|
||||
"\n",
|
||||
"# # Create the index based on a configuration file\n",
|
||||
"# kg.create_index(config_path=Path(graph_dir) / \"repo-config.ttl\")\n",
|
||||
"\n",
|
||||
"# # Import triples of subject, predicate, and object statements from a ttl file\n",
|
||||
"# kg.import_from_ttl_file(index=\"tutorial_10_index\", path=Path(graph_dir) / \"triples.ttl\")\n",
|
||||
"# print(f\"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}\")\n",
|
||||
"# print(f\"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # Define prefixes for names of resources so that we can use shorter resource names in queries\n",
|
||||
"# prefixes = \"\"\"PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
|
||||
"# PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
|
||||
"# PREFIX hp: <https://deepset.ai/harry_potter/>\n",
|
||||
"# \"\"\"\n",
|
||||
"# kg.prefixes = prefixes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load the pre-trained retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load a pre-trained model that translates text queries to SPARQL queries\n",
|
||||
"kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=Path(model_dir) / \"hp_v3.4\")"
|
||||
]
|
||||
@ -202,7 +295,6 @@
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
@ -218,14 +310,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Translating the text query \"In which house is Harry Potter?\" to a SPARQL query and executing it on the knowledge graph...\n",
|
||||
"[{'answer': ['https://deepset.ai/harry_potter/Gryffindor'], 'prediction_meta': {'model': 'Text2SparqlRetriever', 'sparql_query': 'select ?a { hp:Harry_potter hp:house ?a . }'}}]\n",
|
||||
"Executing a SPARQL query with prefixed names of resources...\n",
|
||||
"(['https://deepset.ai/harry_potter/Rubeus_hagrid', 'https://deepset.ai/harry_potter/Ogg'], 'select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }')\n",
|
||||
"Executing a SPARQL query with full names of resources...\n",
|
||||
"(['https://deepset.ai/harry_potter/Otter'], 'select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"In which house is Harry Potter?\"\n",
|
||||
"print(f'Translating the text query \"{query}\" to a SPARQL query and executing it on the knowledge graph...')\n",
|
||||
@ -253,9 +361,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## About us\n",
|
||||
"\n",
|
||||
@ -278,23 +384,28 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "d6fc774dec8e6d4d8b6a5562b41269a570ea5456d1c03f28da35966a9134f033"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
||||
@ -5,10 +5,9 @@ import time
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.nodes import Text2SparqlRetriever
|
||||
from haystack.document_stores import GraphDBKnowledgeGraph
|
||||
from haystack.document_stores import GraphDBKnowledgeGraph, InMemoryKnowledgeGraph
|
||||
from haystack.utils import fetch_archive_from_http
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -24,46 +23,52 @@ def tutorial10_knowledge_graph():
|
||||
s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
|
||||
fetch_archive_from_http(url=s3_url, output_dir=model_dir)
|
||||
|
||||
LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", True)
|
||||
|
||||
# Start a GraphDB server
|
||||
if LAUNCH_GRAPHDB:
|
||||
print("Starting GraphDB ...")
|
||||
status = subprocess.run(
|
||||
[
|
||||
"docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11"
|
||||
],
|
||||
shell=True,
|
||||
)
|
||||
if status.returncode:
|
||||
status = subprocess.run(["docker start graphdb-instance-tutorial"], shell=True)
|
||||
if status.returncode:
|
||||
raise Exception(
|
||||
"Failed to launch GraphDB. If you want to connect to an already running GraphDB instance"
|
||||
"then set LAUNCH_GRAPHDB in the script to False."
|
||||
)
|
||||
time.sleep(5)
|
||||
|
||||
# Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
|
||||
kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
|
||||
|
||||
# Initialize an in-memory knowledge graph and use "tutorial_10_index" as the name of the index
|
||||
kg = InMemoryKnowledgeGraph(index="tutorial_10_index")
|
||||
# Delete the index as it might have been already created in previous runs
|
||||
kg.delete_index()
|
||||
|
||||
# Create the index based on a configuration file
|
||||
kg.create_index(config_path=Path(graph_dir + "repo-config.ttl"))
|
||||
|
||||
# Create the index
|
||||
kg.create_index()
|
||||
# Import triples of subject, predicate, and object statements from a ttl file
|
||||
kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl"))
|
||||
kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl")
|
||||
print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}")
|
||||
print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")
|
||||
|
||||
# Define prefixes for names of resources so that we can use shorter resource names in queries
|
||||
prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
||||
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
||||
PREFIX hp: <https://deepset.ai/harry_potter/>
|
||||
"""
|
||||
kg.prefixes = prefixes
|
||||
# ALTERNATIVE PATH USING GraphDB as knowledge graph
|
||||
# LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", True)
|
||||
# # Start a GraphDB server
|
||||
# if LAUNCH_GRAPHDB:
|
||||
# print("Starting GraphDB ...")
|
||||
# status = subprocess.run(
|
||||
# [
|
||||
# "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11"
|
||||
# ],
|
||||
# shell=True,
|
||||
# )
|
||||
# if status.returncode:
|
||||
# status = subprocess.run(["docker start graphdb-instance-tutorial"], shell=True)
|
||||
# if status.returncode:
|
||||
# raise Exception(
|
||||
# "Failed to launch GraphDB. If you want to connect to an already running GraphDB instance"
|
||||
# "then set LAUNCH_GRAPHDB in the script to False."
|
||||
# )
|
||||
# time.sleep(5)
|
||||
# # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
|
||||
# kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
|
||||
# # Delete the index as it might have been already created in previous runs
|
||||
# kg.delete_index()
|
||||
# # Create the index based on a configuration file
|
||||
# kg.create_index(config_path=Path(graph_dir + "repo-config.ttl"))
|
||||
# # Import triples of subject, predicate, and object statements from a ttl file
|
||||
# kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl"))
|
||||
# print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}")
|
||||
# print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")
|
||||
# # Define prefixes for names of resources so that we can use shorter resource names in queries
|
||||
# prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
||||
# PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
||||
# PREFIX hp: <https://deepset.ai/harry_potter/>
|
||||
# """
|
||||
# kg.prefixes = prefixes
|
||||
|
||||
# Load a pre-trained model that translates text queries to SPARQL queries
|
||||
kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user