# haystack/tutorials/Tutorial10_Knowledge_Graph.py

import logging

# We configure how logging messages should be displayed and which log level should be used before importing Haystack.
# Example log message:
# INFO - haystack.utils.preprocessing -  Converting data/tutorial1/218_Olenna_Tyrell.txt
# The default log level in basicConfig is already WARNING, so the explicit parameter is not strictly necessary;
# it is spelled out here so that it can be changed easily.
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
# The "haystack" logger is set to the more verbose INFO level, overriding the WARNING level configured above.
logging.getLogger("haystack").setLevel(logging.INFO)

import os
import subprocess
import time
from pathlib import Path

from haystack.nodes import Text2SparqlRetriever
from haystack.document_stores import GraphDBKnowledgeGraph, InMemoryKnowledgeGraph
from haystack.utils import fetch_archive_from_http


def tutorial10_knowledge_graph():
    """Run the knowledge graph question answering tutorial end to end.

    The function downloads example triples and a pre-trained text-to-SPARQL model,
    loads the triples into an in-memory knowledge graph, and prints the results of
    one natural-language question and two hand-written SPARQL queries.

    It takes no arguments and returns nothing; all results are printed to stdout.
    """
    # Let's first fetch some triples that we want to store in our knowledge graph
    # Here: exemplary triples from the wizarding world
    graph_dir = "data/tutorial10/"
    triples_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
    fetch_archive_from_http(url=triples_url, output_dir=graph_dir)

    # Fetch a pre-trained BART model that translates text queries to SPARQL queries
    model_dir = "../saved_models/tutorial10_knowledge_graph/"
    model_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
    fetch_archive_from_http(url=model_url, output_dir=model_dir)

    # Initialize an in-memory knowledge graph and use "tutorial_10_index" as the name of the index
    kg = InMemoryKnowledgeGraph(index="tutorial_10_index")
    # Delete the index as it might have been already created in previous runs
    kg.delete_index()
    # Create the index
    kg.create_index()
    # Import triples of subject, predicate, and object statements from a ttl file
    kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl")
    # Query the graph once and reuse the result for both messages below
    triples = kg.get_all_triples()
    # Guard the lookup so that an empty graph does not fail with an opaque IndexError
    if triples:
        print(f"The last triple stored in the knowledge graph is: {triples[-1]}")
    print(f"There are {len(triples)} triples stored in the knowledge graph.")

    # ALTERNATIVE PATH USING GraphDB as knowledge graph
    # NOTE: a value read from the environment is a string, so any non-empty value (even "False") is truthy.
    # LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", True)
    # # Start a GraphDB server
    # if LAUNCH_GRAPHDB:
    #     print("Starting GraphDB ...")
    #     status = subprocess.run(
    #         [
    #             "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11"
    #         ],
    #         shell=True,
    #     )
    #     if status.returncode:
    #         status = subprocess.run(["docker start graphdb-instance-tutorial"], shell=True)
    #         if status.returncode:
    #             raise Exception(
    #                 "Failed to launch GraphDB. If you want to connect to an already running GraphDB instance "
    #                 "then set LAUNCH_GRAPHDB in the script to False."
    #             )
    #     time.sleep(5)
    # # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
    # kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
    # # Delete the index as it might have been already created in previous runs
    # kg.delete_index()
    # # Create the index based on a configuration file
    # kg.create_index(config_path=Path(graph_dir + "repo-config.ttl"))
    # # Import triples of subject, predicate, and object statements from a ttl file
    # kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl"))
    # print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}")
    # print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")
    # # Define prefixes for names of resources so that we can use shorter resource names in queries
    # prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    # PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    # PREFIX hp: <https://deepset.ai/harry_potter/>
    # """
    # kg.prefixes = prefixes

    # Load a pre-trained model that translates text queries to SPARQL queries
    kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")

    # We can now ask questions that will be answered by our knowledge graph!
    # One limitation though: our pre-trained model can only generate queries about resources it has seen
    # during training. Otherwise, it cannot translate the name of the resource to the identifier used in
    # the knowledge graph.
    # E.g. "Harry" -> "hp:Harry_potter"

    query = "In which house is Harry Potter?"
    print(f'Translating the text query "{query}" to a SPARQL query and executing it on the knowledge graph...')
    result = kgqa_retriever.retrieve(query=query)
    print(result)
    # Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . }
    # Correct answer: Gryffindor

    print("Executing a SPARQL query with prefixed names of resources...")
    result = kgqa_retriever._query_kg(
        sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }"
    )
    print(result)
    # Paraphrased question: Who is the keeper of keys and grounds?
    # Correct answer: Rubeus Hagrid

    print("Executing a SPARQL query with full names of resources...")
    result = kgqa_retriever._query_kg(
        sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
    )
    print(result)
    # Paraphrased question: What is the patronus of Hermione?
    # Correct answer: Otter


# Run the tutorial only when this file is executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    tutorial10_knowledge_graph()

# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/