diff --git a/docs/_src/tutorials/tutorials/10.md b/docs/_src/tutorials/tutorials/10.md index 81c2b9998..af1b586a3 100644 --- a/docs/_src/tutorials/tutorials/10.md +++ b/docs/_src/tutorials/tutorials/10.md @@ -22,7 +22,7 @@ The training of models that translate text queries into SPARQL queries is curren # Install the latest master of Haystack !pip install --upgrade pip -!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,graphdb] +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,inmemorygraph] ``` @@ -34,7 +34,7 @@ import time from pathlib import Path from haystack.nodes import Text2SparqlRetriever -from haystack.document_stores import GraphDBKnowledgeGraph +from haystack.document_stores import InMemoryKnowledgeGraph from haystack.utils import fetch_archive_from_http ``` @@ -54,44 +54,24 @@ s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip" fetch_archive_from_http(url=s3_url, output_dir=model_dir) ``` -## Launching a GraphDB instance +## Initialize a knowledge graph and load data + +Currently, Haystack supports two alternative implementations for knowledge graphs: +* simple InMemoryKnowledgeGraph (based on RDFLib in-memory store) +* GraphDBKnowledgeGraph, which runs on GraphDB. + +### InMemoryKnowledgeGraph ```python -# Unfortunately, there seems to be no good way to run GraphDB in colab environments -# In your local environment, you could start a GraphDB server with docker -# Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/ -import os - -LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", False) - -if LAUNCH_GRAPHDB: - print("Starting GraphDB ...") - status = subprocess.run( - [ - "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11" - ], - shell=True, - ) - if status.returncode: - raise Exception( - "Failed to launch GraphDB. 
Maybe it is already running or you already have a container with that name that you could start?" - ) - time.sleep(5) -``` - -## Creating a new GraphDB repository (also known as index in haystack's document stores) - - -```python -# Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index -kg = GraphDBKnowledgeGraph(index="tutorial_10_index") +# Initialize an in-memory knowledge graph and use "tutorial_10_index" as the name of the index +kg = InMemoryKnowledgeGraph(index="tutorial_10_index") # Delete the index as it might have been already created in previous runs kg.delete_index() -# Create the index based on a configuration file -kg.create_index(config_path=Path(graph_dir) / "repo-config.ttl") +# Create the index +kg.create_index() # Import triples of subject, predicate, and object statements from a ttl file kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl") @@ -99,15 +79,69 @@ print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[ print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.") ``` +### GraphDBKnowledgeGraph (alternative) + +#### Launching a GraphDB instance + ```python -# Define prefixes for names of resources so that we can use shorter resource names in queries -prefixes = """PREFIX rdf: -PREFIX xsd: -PREFIX hp: -""" -kg.prefixes = prefixes +# # Unfortunately, there seems to be no good way to run GraphDB in colab environments +# # In your local environment, you could start a GraphDB server with docker +# # Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/ +# import os +# LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", False) + +# if LAUNCH_GRAPHDB: +# print("Starting GraphDB ...") +# status = subprocess.run( +# [ +# "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11" +# ], +# 
shell=True, +# ) +# if status.returncode: +# raise Exception( +# "Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?" +# ) +# time.sleep(5) +``` + +#### Creating a new GraphDB repository (also known as index in haystack's document stores) + + +```python +# from haystack.document_stores import GraphDBKnowledgeGraph + +# # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index +# kg = GraphDBKnowledgeGraph(index="tutorial_10_index") + +# # Delete the index as it might have been already created in previous runs +# kg.delete_index() + +# # Create the index based on a configuration file +# kg.create_index(config_path=Path(graph_dir) / "repo-config.ttl") + +# # Import triples of subject, predicate, and object statements from a ttl file +# kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl") +# print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}") +# print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.") +``` + + +```python +# # Define prefixes for names of resources so that we can use shorter resource names in queries +# prefixes = """PREFIX rdf: +# PREFIX xsd: +# PREFIX hp: +# """ +# kg.prefixes = prefixes +``` + +## Load the pre-trained retriever + + +```python # Load a pre-trained model that translates text queries to SPARQL queries kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=Path(model_dir) / "hp_v3.4") ``` diff --git a/haystack/document_stores/__init__.py b/haystack/document_stores/__init__.py index a44f7fd1e..1491a2f8d 100644 --- a/haystack/document_stores/__init__.py +++ b/haystack/document_stores/__init__.py @@ -23,3 +23,6 @@ else: MilvusDocumentStore = safe_import("haystack.document_stores.milvus2", "Milvus2DocumentStore", "milvus") WeaviateDocumentStore = safe_import("haystack.document_stores.weaviate", 
"WeaviateDocumentStore", "weaviate") GraphDBKnowledgeGraph = safe_import("haystack.document_stores.graphdb", "GraphDBKnowledgeGraph", "graphdb") +InMemoryKnowledgeGraph = safe_import( + "haystack.document_stores.memory_knowledgegraph", "InMemoryKnowledgeGraph", "inmemorygraph" +) diff --git a/haystack/document_stores/memory_knowledgegraph.py b/haystack/document_stores/memory_knowledgegraph.py new file mode 100644 index 000000000..6108d1025 --- /dev/null +++ b/haystack/document_stores/memory_knowledgegraph.py @@ -0,0 +1,137 @@ +from typing import Dict, Optional + +import logging +from collections import defaultdict +from pathlib import Path + +from rdflib import Graph + +from haystack.document_stores import BaseKnowledgeGraph + +logger = logging.getLogger(__name__) + + +class InMemoryKnowledgeGraph(BaseKnowledgeGraph): + """ + In memory Knowledge graph store, based on rdflib. + """ + + def __init__(self, index: str = "document"): + """ + Init the in memory knowledge graph. An index that was never created explicitly is set up as an empty graph when it is first loaded into or queried. + + :param index: name of the index + """ + super().__init__() + + self.indexes: Dict[str, Graph] = defaultdict(Graph) + self.index: str = index + + def create_index(self, index: Optional[str] = None): + """ + Create a new index stored in memory + + :param index: name of the index + """ + index = index or self.index + if index not in self.indexes: + self.indexes[index] = Graph() + else: + logger.warning(f"Index '{index}' is already present.") + + def delete_index(self, index: Optional[str] = None): + """ + Delete an existing index. The index including all data will be removed. + + :param index: The name of the index to delete. 
+ """ + index = index or self.index + + if index in self.indexes: + del self.indexes[index] + logger.info(f"Index '{index}' deleted.") + + def import_from_ttl_file(self, path: Path, index: Optional[str] = None): + """ + Load in memory an existing knowledge graph represented in the form of triples of subject, predicate, and object from a .ttl file + + :param path: path to a .ttl containing a knowledge graph + :param index: name of the index + """ + index = index or self.index + self.indexes[index].parse(path) + + def get_all_triples(self, index: Optional[str] = None): + """ + Query the given in memory index for all its stored triples. Duplicates are not filtered. + + :param index: name of the index + :return: all triples stored in the index + """ + sparql_query = "SELECT * WHERE { ?s ?p ?o. }" + results = self.query(sparql_query=sparql_query, index=index) + return results + + def get_all_subjects(self, index: Optional[str] = None): + """ + Query the given in memory index for all its stored subjects. Duplicates are not filtered. + + :param index: name of the index + :return: all subjects stored in the index + """ + sparql_query = "SELECT ?s WHERE { ?s ?p ?o. }" + results = self.query(sparql_query=sparql_query, index=index) + return results + + def get_all_predicates(self, index: Optional[str] = None): + """ + Query the given in memory index for all its stored predicates. Duplicates are not filtered. + + :param index: name of the index + :return: all predicates stored in the index + """ + sparql_query = "SELECT ?p WHERE { ?s ?p ?o. }" + results = self.query(sparql_query=sparql_query, index=index) + return results + + def _create_document_field_map(self) -> Dict: + """ + There is no field mapping required + """ + return {} + + def get_all_objects(self, index: Optional[str] = None): + """ + Query the given in memory index for all its stored objects. Duplicates are not filtered. 
+ + :param index: name of the index + :return: all objects stored in the index + """ + sparql_query = "SELECT ?o WHERE { ?s ?p ?o. }" + results = self.query(sparql_query=sparql_query, index=index) + return results + + def query(self, sparql_query: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None): + """ + Execute a SPARQL query on the given in memory index + + :param sparql_query: SPARQL query that shall be executed + :param index: name of the index + :return: query result + """ + index = index or self.index + raw_results = self.indexes[index].query(sparql_query) + + if raw_results.askAnswer is not None: + return raw_results.askAnswer + else: + formatted_results = [] + for b in raw_results.bindings: + formatted_result = {} + items = list(b.items()) + for item in items: + type_ = item[0].toPython()[1:] + uri = item[1].toPython() + formatted_result[type_] = {"type": "uri", "value": uri} + formatted_results.append(formatted_result) + return formatted_results diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index 975ec0959..ff5511656 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -40,6 +40,9 @@ { "$ref": "#/definitions/InMemoryDocumentStoreComponent" }, + { + "$ref": "#/definitions/InMemoryKnowledgeGraphComponent" + }, { "$ref": "#/definitions/Milvus2DocumentStoreComponent" }, @@ -845,6 +848,40 @@ ], "additionalProperties": false }, + "InMemoryKnowledgeGraphComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "InMemoryKnowledgeGraph" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "index": { + "title": "Index", + "default": "document", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "Milvus2DocumentStoreComponent": { "type": "object", "properties": { diff --git a/setup.cfg b/setup.cfg index c5ff30269..c92a76ee7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -146,10 +146,12 @@ pinecone = farm-haystack[sql,only-pinecone] graphdb = SPARQLWrapper +inmemorygraph = + SPARQLWrapper docstores = - farm-haystack[faiss,milvus,weaviate,graphdb,pinecone] + farm-haystack[faiss,milvus,weaviate,graphdb,inmemorygraph,pinecone] docstores-gpu = - farm-haystack[faiss-gpu,milvus,weaviate,graphdb,pinecone] + farm-haystack[faiss-gpu,milvus,weaviate,graphdb,inmemorygraph,pinecone] audio = espnet diff --git a/test/document_stores/test_knowledge_graph.py b/test/document_stores/test_knowledge_graph.py index 4157b106d..6e71dcacb 100644 --- a/test/document_stores/test_knowledge_graph.py +++ b/test/document_stores/test_knowledge_graph.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest from haystack.nodes import Text2SparqlRetriever -from haystack.document_stores import GraphDBKnowledgeGraph +from haystack.document_stores import GraphDBKnowledgeGraph, InMemoryKnowledgeGraph from haystack.utils import fetch_archive_from_http @@ -60,3 +60,50 @@ def test_graph_retrieval(): sparql_query="select distinct ?obj where { ?obj . 
}" ) assert result[0][0] == "https://deepset.ai/harry_potter/Otter" + + +@pytest.mark.integration +def test_inmemory_graph_retrieval(): + # TODO rename doc_dir + graph_dir = "../data/tutorial10_knowledge_graph/" + s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip" + fetch_archive_from_http(url=s3_url, output_dir=graph_dir) + + # Fetch a pre-trained BART model that translates natural language questions to SPARQL queries + model_dir = "../saved_models/tutorial10_knowledge_graph/" + s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip" + fetch_archive_from_http(url=s3_url, output_dir=model_dir) + + kg = InMemoryKnowledgeGraph(index="tutorial_10_index") + kg.delete_index() + kg.create_index() + kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl")) + triple = { + "p": {"type": "uri", "value": "https://deepset.ai/harry_potter/_paternalgrandfather"}, + "s": {"type": "uri", "value": "https://deepset.ai/harry_potter/Melody_fawley"}, + "o": {"type": "uri", "value": "https://deepset.ai/harry_potter/Marshall_fawley"}, + } + triples = kg.get_all_triples() + assert len(triples) > 0 + assert triple in triples + + kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4") + + result = kgqa_retriever.retrieve(query="In which house is Harry Potter?") + assert result[0] == { + "answer": ["https://deepset.ai/harry_potter/Gryffindor"], + "prediction_meta": { + "model": "Text2SparqlRetriever", + "sparql_query": "select ?a { hp:Harry_potter hp:house ?a . }", + }, + } + + result = kgqa_retriever._query_kg( + sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }" + ) + assert result[0][0] == "https://deepset.ai/harry_potter/Rubeus_hagrid" + + result = kgqa_retriever._query_kg( + sparql_query="select distinct ?obj where { ?obj . 
}" + ) + assert result[0][0] == "https://deepset.ai/harry_potter/Otter" diff --git a/tutorials/Tutorial10_Knowledge_Graph.ipynb b/tutorials/Tutorial10_Knowledge_Graph.ipynb index 50ef362e2..e40e04d00 100644 --- a/tutorials/Tutorial10_Knowledge_Graph.ipynb +++ b/tutorials/Tutorial10_Knowledge_Graph.ipynb @@ -3,7 +3,6 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, "pycharm": { "name": "#%% md\n" } @@ -23,6 +22,9 @@ "execution_count": null, "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" } @@ -34,7 +36,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install --upgrade pip\n", - "!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,graphdb]" + "!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,inmemorygraph]" ] }, { @@ -42,6 +44,9 @@ "execution_count": null, "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" } @@ -55,14 +60,13 @@ "from pathlib import Path\n", "\n", "from haystack.nodes import Text2SparqlRetriever\n", - "from haystack.document_stores import GraphDBKnowledgeGraph\n", + "from haystack.document_stores import InMemoryKnowledgeGraph\n", "from haystack.utils import fetch_archive_from_http" ] }, { "cell_type": "markdown", "metadata": { - "collapsed": false, "pycharm": { "name": "#%% md\n" } @@ -76,6 +80,9 @@ "execution_count": null, "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" } @@ -96,80 +103,50 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ - "## Launching a GraphDB instance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# Unfortunately, there seems to be no 
good way to run GraphDB in colab environments\n", - "# In your local environment, you could start a GraphDB server with docker\n", - "# Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/\n", - "import os\n", - "\n", - "LAUNCH_GRAPHDB = os.environ.get(\"LAUNCH_GRAPHDB\", False)\n", - "\n", - "if LAUNCH_GRAPHDB:\n", - " print(\"Starting GraphDB ...\")\n", - " status = subprocess.run(\n", - " [\n", - " \"docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11\"\n", - " ],\n", - " shell=True,\n", - " )\n", - " if status.returncode:\n", - " raise Exception(\n", - " \"Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?\"\n", - " )\n", - " time.sleep(5)" + "## Initialize a knowledge graph and load data" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ - "## Creating a new GraphDB repository (also known as index in haystack's document stores)" + "Currently, Haystack supports two alternative implementations for knowledge graphs:\n", + "* simple InMemoryKnowledgeGraph (based on RDFLib in-memory store)\n", + "* GraphDBKnowledgeGraph, which runs on GraphDB." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### InMemoryKnowledgeGraph " ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The last triple stored in the knowledge graph is: {'s': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/Harry_potter'}, 'p': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/family'}, 'o': {'type': 'uri', 'value': 'https://deepset.ai/harry_potter/Dudley_dursleys_children'}}\n", + "There are 118543 triples stored in the knowledge graph.\n" + ] } - }, - "outputs": [], + ], "source": [ - "# Initialize a knowledge graph connected to GraphDB and use \"tutorial_10_index\" as the name of the index\n", - "kg = GraphDBKnowledgeGraph(index=\"tutorial_10_index\")\n", + "# Initialize an in-memory knowledge graph and use \"tutorial_10_index\" as the name of the index\n", + "kg = InMemoryKnowledgeGraph(index=\"tutorial_10_index\")\n", "\n", "# Delete the index as it might have been already created in previous runs\n", "kg.delete_index()\n", "\n", - "# Create the index based on a configuration file\n", - "kg.create_index(config_path=Path(graph_dir) / \"repo-config.ttl\")\n", + "# Create the index\n", + "kg.create_index()\n", "\n", "# Import triples of subject, predicate, and object statements from a ttl file\n", "kg.import_from_ttl_file(index=\"tutorial_10_index\", path=Path(graph_dir) / \"triples.ttl\")\n", @@ -177,24 +154,140 @@ "print(f\"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.\")" ] }, + { + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "### GraphDBKnowledgeGraph (alternative)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Launching a 
GraphDB instance" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ - "# Define prefixes for names of resources so that we can use shorter resource names in queries\n", - "prefixes = \"\"\"PREFIX rdf: \n", - "PREFIX xsd: \n", - "PREFIX hp: \n", - "\"\"\"\n", - "kg.prefixes = prefixes\n", + "# # Unfortunately, there seems to be no good way to run GraphDB in colab environments\n", + "# # In your local environment, you could start a GraphDB server with docker\n", + "# # Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/\n", + "# import os\n", "\n", + "# LAUNCH_GRAPHDB = os.environ.get(\"LAUNCH_GRAPHDB\", False)\n", + "\n", + "# if LAUNCH_GRAPHDB:\n", + "# print(\"Starting GraphDB ...\")\n", + "# status = subprocess.run(\n", + "# [\n", + "# \"docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11\"\n", + "# ],\n", + "# shell=True,\n", + "# )\n", + "# if status.returncode:\n", + "# raise Exception(\n", + "# \"Failed to launch GraphDB. 
Maybe it is already running or you already have a container with that name that you could start?\"\n", + "# )\n", + "# time.sleep(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### Creating a new GraphDB repository (also known as index in haystack's document stores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# from haystack.document_stores import GraphDBKnowledgeGraph\n", + "\n", + "# # Initialize a knowledge graph connected to GraphDB and use \"tutorial_10_index\" as the name of the index\n", + "# kg = GraphDBKnowledgeGraph(index=\"tutorial_10_index\")\n", + "\n", + "# # Delete the index as it might have been already created in previous runs\n", + "# kg.delete_index()\n", + "\n", + "# # Create the index based on a configuration file\n", + "# kg.create_index(config_path=Path(graph_dir) / \"repo-config.ttl\")\n", + "\n", + "# # Import triples of subject, predicate, and object statements from a ttl file\n", + "# kg.import_from_ttl_file(index=\"tutorial_10_index\", path=Path(graph_dir) / \"triples.ttl\")\n", + "# print(f\"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}\")\n", + "# print(f\"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# # Define prefixes for names of resources so that we can use shorter resource names in queries\n", + "# prefixes = \"\"\"PREFIX rdf: \n", + "# PREFIX xsd: \n", + "# PREFIX hp: \n", + "# \"\"\"\n", + "# kg.prefixes = prefixes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Load the pre-trained retriever" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ "# Load a pre-trained model that translates text queries to SPARQL queries\n", "kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=Path(model_dir) / \"hp_v3.4\")" ] @@ -202,7 +295,6 @@ { "cell_type": "markdown", "metadata": { - "collapsed": false, "pycharm": { "name": "#%% md\n" } @@ -218,14 +310,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Translating the text query \"In which house is Harry Potter?\" to a SPARQL query and executing it on the knowledge graph...\n", + "[{'answer': ['https://deepset.ai/harry_potter/Gryffindor'], 'prediction_meta': {'model': 'Text2SparqlRetriever', 'sparql_query': 'select ?a { hp:Harry_potter hp:house ?a . }'}}]\n", + "Executing a SPARQL query with prefixed names of resources...\n", + "(['https://deepset.ai/harry_potter/Rubeus_hagrid', 'https://deepset.ai/harry_potter/Ogg'], 'select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }')\n", + "Executing a SPARQL query with full names of resources...\n", + "(['https://deepset.ai/harry_potter/Otter'], 'select distinct ?obj where { ?obj . 
}')\n" + ] + } + ], "source": [ "query = \"In which house is Harry Potter?\"\n", "print(f'Translating the text query \"{query}\" to a SPARQL query and executing it on the knowledge graph...')\n", @@ -253,9 +361,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## About us\n", "\n", @@ -278,23 +384,28 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "d6fc774dec8e6d4d8b6a5562b41269a570ea5456d1c03f28da35966a9134f033" + } } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/tutorials/Tutorial10_Knowledge_Graph.py b/tutorials/Tutorial10_Knowledge_Graph.py index e9d7c6f74..ebc696561 100644 --- a/tutorials/Tutorial10_Knowledge_Graph.py +++ b/tutorials/Tutorial10_Knowledge_Graph.py @@ -5,10 +5,9 @@ import time from pathlib import Path from haystack.nodes import Text2SparqlRetriever -from haystack.document_stores import GraphDBKnowledgeGraph +from haystack.document_stores import GraphDBKnowledgeGraph, InMemoryKnowledgeGraph from haystack.utils import fetch_archive_from_http - logger = logging.getLogger(__name__) @@ -24,46 +23,52 @@ def tutorial10_knowledge_graph(): s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip" fetch_archive_from_http(url=s3_url, output_dir=model_dir) - LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", True) - - # Start a GraphDB server - if LAUNCH_GRAPHDB: - print("Starting GraphDB ...") - status = subprocess.run( - [ - "docker run -d -p 7200:7200 --name graphdb-instance-tutorial 
docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11" - ], - shell=True, - ) - if status.returncode: - status = subprocess.run(["docker start graphdb-instance-tutorial"], shell=True) - if status.returncode: - raise Exception( - "Failed to launch GraphDB. If you want to connect to an already running GraphDB instance" - "then set LAUNCH_GRAPHDB in the script to False." - ) - time.sleep(5) - - # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index - kg = GraphDBKnowledgeGraph(index="tutorial_10_index") - + # Initialize an in-memory knowledge graph and use "tutorial_10_index" as the name of the index + kg = InMemoryKnowledgeGraph(index="tutorial_10_index") # Delete the index as it might have been already created in previous runs kg.delete_index() - - # Create the index based on a configuration file - kg.create_index(config_path=Path(graph_dir + "repo-config.ttl")) - + # Create the index + kg.create_index() # Import triples of subject, predicate, and object statements from a ttl file - kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl")) + kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl") print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}") print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.") - # Define prefixes for names of resources so that we can use shorter resource names in queries - prefixes = """PREFIX rdf: - PREFIX xsd: - PREFIX hp: - """ - kg.prefixes = prefixes + # ALTERNATIVE PATH USING GraphDB as knowledge graph + # LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", True) + # # Start a GraphDB server + # if LAUNCH_GRAPHDB: + # print("Starting GraphDB ...") + # status = subprocess.run( + # [ + # "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11" + # ], + # shell=True, + # ) + # if 
status.returncode: + # status = subprocess.run(["docker start graphdb-instance-tutorial"], shell=True) + # if status.returncode: + # raise Exception( + # "Failed to launch GraphDB. If you want to connect to an already running GraphDB instance" + # "then set LAUNCH_GRAPHDB in the script to False." + # ) + # time.sleep(5) + # # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index + # kg = GraphDBKnowledgeGraph(index="tutorial_10_index") + # # Delete the index as it might have been already created in previous runs + # kg.delete_index() + # # Create the index based on a configuration file + # kg.create_index(config_path=Path(graph_dir + "repo-config.ttl")) + # # Import triples of subject, predicate, and object statements from a ttl file + # kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl")) + # print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}") + # print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.") + # # Define prefixes for names of resources so that we can use shorter resource names in queries + # prefixes = """PREFIX rdf: + # PREFIX xsd: + # PREFIX hp: + # """ + # kg.prefixes = prefixes # Load a pre-trained model that translates text queries to SPARQL queries kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")