haystack/tutorials/Tutorial10_Knowledge_Graph.py

import logging
import subprocess
import time
from pathlib import Path

from haystack.graph_retriever.text_to_sparql import Text2SparqlRetriever
from haystack.knowledge_graph.graphdb import GraphDBKnowledgeGraph
from haystack.preprocessor.utils import fetch_archive_from_http

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


def tutorial10_knowledge_graph():
    """Run the knowledge graph question answering tutorial end to end.

    The function performs the following steps in order:

    1. Fetch an archive with example triples and a GraphDB repository config,
       and an archive with a pre-trained text-to-SPARQL model.
    2. If ``LAUNCH_GRAPHDB`` is true, create a GraphDB Docker container, or
       start the existing one when creation fails (e.g. because a container
       with that name was created by a previous run).
    3. Recreate the ``tutorial_10_index`` index and import the triples.
    4. Answer one natural-language query and two hand-written SPARQL queries,
       logging each result.

    All progress and results are emitted at INFO level on the module logger,
    so they are only visible when logging is configured to show INFO records.

    Raises:
        Exception: If the GraphDB container can neither be created nor started.
    """
    # Let's first fetch some triples that we want to store in our knowledge graph
    # Here: exemplary triples from the wizarding world
    graph_dir = "../data/tutorial10_knowledge_graph/"
    triples_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
    fetch_archive_from_http(url=triples_url, output_dir=graph_dir)

    # Fetch a pre-trained BART model that translates text queries to SPARQL queries
    model_dir = "../saved_models/tutorial10_knowledge_graph/"
    model_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
    fetch_archive_from_http(url=model_url, output_dir=model_dir)

    LAUNCH_GRAPHDB = True

    # Start a GraphDB server
    if LAUNCH_GRAPHDB:
        logger.info("Starting GraphDB ...")
        container_name = "graphdb-instance-tutorial"
        image = "docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11"
        # Try to create a fresh container first; if that fails, fall back to
        # starting the container that already exists under the same name.
        docker_commands = [
            ["docker", "run", "-d", "-p", "7200:7200", "--name", container_name, image],
            ["docker", "start", container_name],
        ]
        for command in docker_commands:
            try:
                # Argument list without a shell: no quoting pitfalls and no
                # dependence on how the platform shell splits the command.
                if subprocess.run(command).returncode == 0:
                    break
            except OSError:
                # The docker executable is missing or cannot be run; treat it
                # like a failed attempt so the same error is raised below.
                continue
        else:
            raise Exception(
                "Failed to launch GraphDB. If you want to connect to an already running GraphDB instance, "
                "then set LAUNCH_GRAPHDB in the script to False."
            )
        # Fixed pause to give the server time to come up before connecting.
        time.sleep(5)

    # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
    index_name = "tutorial_10_index"
    kg = GraphDBKnowledgeGraph(index=index_name)

    # Delete the index as it might have been already created in previous runs
    kg.delete_index()

    # Create the index based on a configuration file
    kg.create_index(config_path=Path(graph_dir) / "repo-config.ttl")

    # Import triples of subject, predicate, and object statements from a ttl file
    kg.import_from_ttl_file(index=index_name, path=Path(graph_dir) / "triples.ttl")

    # Query the graph once and reuse the result for both log messages.
    triples = kg.get_all_triples()
    if triples:
        logger.info("The last triple stored in the knowledge graph is: %s", triples[-1])
    logger.info("There are %s triples stored in the knowledge graph.", len(triples))

    # Define prefixes for names of resources so that we can use shorter resource names in queries
    prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX hp: <https://deepset.ai/harry_potter/>
    """
    kg.prefixes = prefixes

    # Load a pre-trained model that translates text queries to SPARQL queries
    kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")

    # We can now ask questions that will be answered by our knowledge graph!
    # One limitation though: our pre-trained model can only generate queries about resources it has seen during training.
    # Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph.
    # E.g. "Harry" -> "hp:Harry_potter"

    query = "In which house is Harry Potter?"
    logger.info(
        'Translating the text query "%s" to a SPARQL query and executing it on the knowledge graph...', query
    )
    result = kgqa_retriever.retrieve(query=query)
    logger.info(result)
    # Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . }
    # Correct answer: Gryffindor

    # NOTE(review): _query_kg is an underscore-prefixed (non-public) method of the retriever;
    # it is called directly here to run raw SPARQL without the text-to-SPARQL step.
    logger.info("Executing a SPARQL query with prefixed names of resources...")
    result = kgqa_retriever._query_kg(
        sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }"
    )
    logger.info(result)
    # Paraphrased question: Who is the keeper of keys and grounds?
    # Correct answer: Rubeus Hagrid

    logger.info("Executing a SPARQL query with full names of resources...")
    result = kgqa_retriever._query_kg(
        sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
    )
    logger.info(result)
    # Paraphrased question: What is the patronus of Hermione?
    # Correct answer: Otter


# Run the tutorial only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    tutorial10_knowledge_graph()

# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/
knowledge graph example (#934) * Add knowledge graph module * Fix type hint * Add graph retriver module * Change type annotations, change return format * Add graph retriever that executes questions as sparql queries * Linking only those entities that are in the knowledge graph * Added logging and using relations extracted from Knowledge graph for linking * Preventing entity linking from linking the same token to multiple entities * Pruning triples that have no variables for select and count queries * Support knowledge graphs with Pipelines * Add text2sparql * Entity linking and relation linking consider more special cases now based on evaluation on labelled data * Separating example code from KGQA implementation * Add eval on combined extarctive and kg questions * Remove references to hp-test * Add fields sparql_query and long_answer_list to metadata * Removing modular Question2SPARQL approach * Removing additional classes used for modular kgqa approach * preparing lcquad data * change graph db * Translating namespaces in knowledge graph queries * Creating graphdb index and loading triples from .ttl file * Fetching graph config files, triples and model from S3 * Fix incompatibility issues with BaseGraphRetriever and BaseComponent * Removing unused utility functions * Adding doc strings and tutorial header * Adding sparqlwrapper dependency * Moving tutorial header * Sorting tutorials by number within name of notebook * Add latest docstring and tutorial changes * Creating test cases for knowledge graph * Changing knowledge graph example to harry potter * Add latest docstring and tutorial changes * Adapting the tutorial notebook to harry potter example * Add GraphDB fixture for tests * Add latest docstring and tutorial changes * Added GraphDB docker launch to CI * Use correct GraphDB fixture * Check if GraphDB instance is already running * Renaming question/query and incorporating other feedback from Timo and Tanay * Removed type annotation * Add latest docstring and 
tutorial changes Co-authored-by: oryx1729 <oryx1729@protonmail.com> Co-authored-by: Timo Moeller <timo.moeller@deepset.ai> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2021-04-08 14:05:33 +02:00			`import logging`
			`import subprocess`
			`import time`
			`from pathlib import Path`

			`from haystack.graph_retriever.text_to_sparql import Text2SparqlRetriever`
			`from haystack.knowledge_graph.graphdb import GraphDBKnowledgeGraph`
			`from haystack.preprocessor.utils import fetch_archive_from_http`

			`logger = logging.getLogger(__name__)`


			`def tutorial10_knowledge_graph():`
			`# Let's first fetch some triples that we want to store in our knowledge graph`
			`# Here: exemplary triples from the wizarding world`
			`graph_dir = "../data/tutorial10_knowledge_graph/"`
			`s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"`
			`fetch_archive_from_http(url=s3_url, output_dir=graph_dir)`

			`# Fetch a pre-trained BART model that translates text queries to SPARQL queries`
			`model_dir = "../saved_models/tutorial10_knowledge_graph/"`
			`s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"`
			`fetch_archive_from_http(url=s3_url, output_dir=model_dir)`

			`LAUNCH_GRAPHDB = True`

			`# Start a GraphDB server`
			`if LAUNCH_GRAPHDB:`
			`logging.info("Starting GraphDB ...")`
			`status = subprocess.run(`
			`['docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11'], shell=True`
			`)`
			`if status.returncode:`
			`status = subprocess.run(`
			`[`
			`'docker start graphdb-instance-tutorial'],`
			`shell=True`
			`)`
			`if status.returncode:`
			`raise Exception("Failed to launch GraphDB. If you want to connect to an already running GraphDB instance"`
			`"then set LAUNCH_GRAPHDB in the script to False.")`
			`time.sleep(5)`

			`# Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index`
			`kg = GraphDBKnowledgeGraph(index="tutorial_10_index")`

			`# Delete the index as it might have been already created in previous runs`
			`kg.delete_index()`

			`# Create the index based on a configuration file`
			`kg.create_index(config_path=Path(graph_dir+"repo-config.ttl"))`

			`# Import triples of subject, predicate, and object statements from a ttl file`
			`kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir+"triples.ttl"))`
			`logging.info(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}")`
			`logging.info(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")`

			`# Define prefixes for names of resources so that we can use shorter resource names in queries`
			`prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>`
			`PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>`
			`PREFIX hp: <https://deepset.ai/harry_potter/>`
			`"""`
			`kg.prefixes = prefixes`

			`# Load a pre-trained model that translates text queries to SPARQL queries`
			`kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir+"hp_v3.4")`

			`# We can now ask questions that will be answered by our knowledge graph!`
			`# One limitation though: our pre-trained model can only generate questions about resources it has seen during training.`
			`# Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph.`
			`# E.g. "Harry" -> "hp:Harry_potter"`

			`query = "In which house is Harry Potter?"`
			`logging.info(f"Translating the text query \"{query}\" to a SPARQL query and executing it on the knowledge graph...")`
			`result = kgqa_retriever.retrieve(query=query)`
			`logging.info(result)`
			`# Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . }`
			`# Correct answer: Gryffindor`

			`logging.info("Executing a SPARQL query with prefixed names of resources...")`
			`result = kgqa_retriever._query_kg(sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }")`
			`logging.info(result)`
			`# Paraphrased question: Who is the keeper of keys and grounds?`
			`# Correct answer: Rubeus Hagrid`

			`logging.info("Executing a SPARQL query with full names of resources...")`
			`result = kgqa_retriever._query_kg(sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }")`
			`logging.info(result)`
			`# Paraphrased question: What is the patronus of Hermione?`
			`# Correct answer: Otter`


			`if __name__ == "__main__":`
			`tutorial10_knowledge_graph()`
Tutorial update (#1166) * Add header / footer * Add Milvus example * Generate md files * Fix mypy CI 2021-06-11 11:09:15 +02:00
			`# This Haystack script was made with love by deepset in Berlin, Germany`
			`# Haystack: https://github.com/deepset-ai/haystack`
			`# deepset: https://deepset.ai/`