
import os
import logging
import subprocess
import time

from pathlib import Path

from haystack.nodes import Text2SparqlRetriever
from haystack.document_stores import GraphDBKnowledgeGraph
from haystack.utils import fetch_archive_from_http


logger = logging.getLogger(__name__)


def tutorial10_knowledge_graph():
    # Let's first fetch some triples that we want to store in our knowledge graph
    # Here: exemplary triples from the wizarding world
    graph_dir = "data/tutorial10/"
    s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
    fetch_archive_from_http(url=s3_url, output_dir=graph_dir)

    # Fetch a pre-trained BART model that translates text queries to SPARQL queries
    model_dir = "../saved_models/tutorial10_knowledge_graph/"
    s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
    fetch_archive_from_http(url=s3_url, output_dir=model_dir)

    # Whether this script should launch its own GraphDB container. Environment variables are
    # strings, so the raw value would always be truthy; parse it explicitly instead.
    LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", "true").lower() in ("true", "1")
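    # A hypothetical shell invocation (assuming a POSIX shell and this file's name) to skip
    # launching the container and connect to a GraphDB instance already running on port 7200:
    #   LAUNCH_GRAPHDB=false python tutorial10_knowledge_graph.py
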
    # Start a GraphDB server
    if LAUNCH_GRAPHDB:
        print("Starting GraphDB ...")
        status = subprocess.run(
            "docker run -d -p 7200:7200 --name graphdb-instance-tutorial "
            "docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11",
            shell=True,
        )
        if status.returncode:
            # The container may already exist from a previous run, so try to restart it
            status = subprocess.run("docker start graphdb-instance-tutorial", shell=True)
            if status.returncode:
                raise Exception(
                    "Failed to launch GraphDB. If you want to connect to an already running GraphDB instance, "
                    "then set LAUNCH_GRAPHDB in the script to False."
                )
        # Give the server a moment to start up before connecting to it
        time.sleep(5)
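    # A fixed sleep can be flaky on slow machines. A minimal readiness-poll sketch (commented out;
    # the REST endpoint path is an assumption -- verify it against your GraphDB version):
    #
    #   import requests
    #   for _ in range(30):
    #       try:
    #           if requests.get("http://localhost:7200/rest/repositories", timeout=2).ok:
    #               break
    #       except requests.exceptions.ConnectionError:
    #           time.sleep(1)
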
    # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
    kg = GraphDBKnowledgeGraph(index="tutorial_10_index")

    # Delete the index as it might have been already created in previous runs
    kg.delete_index()

    # Create the index based on a configuration file
    kg.create_index(config_path=Path(graph_dir + "repo-config.ttl"))

    # Import triples of subject, predicate, and object statements from a ttl file
    kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl"))
    all_triples = kg.get_all_triples()
    print(f"The last triple stored in the knowledge graph is: {all_triples[-1]}")
    print(f"There are {len(all_triples)} triples stored in the knowledge graph.")
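    # Note: get_all_triples() returns one binding per subject/predicate/object statement; the exact
    # entry format (e.g. SPARQL-JSON-style dicts with "value" fields) depends on the installed
    # Haystack version, so print a single entry to inspect it before processing results further:
    #   print(all_triples[0])
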
    # Define prefixes for names of resources so that we can use shorter resource names in queries
    prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX hp: <https://deepset.ai/harry_potter/>
    """
    kg.prefixes = prefixes
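    # With the hp: prefix registered, a prefixed name such as hp:Harry_potter in a query expands to
    # the full IRI <https://deepset.ai/harry_potter/Harry_potter> when the query is sent to GraphDB,
    # which is why the shorter forms below work interchangeably with the full-IRI query further down.
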
    # Load a pre-trained model that translates text queries to SPARQL queries
    kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4")
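    # The retriever decodes each question into SPARQL with the BART model. Depending on the
    # installed Haystack version, Text2SparqlRetriever may accept a top_k argument to produce
    # several candidate SPARQL queries instead of one (an assumption -- check your version's
    # signature before relying on it):
    #   kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4", top_k=3)
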
    # We can now ask questions that will be answered by our knowledge graph!
    # One limitation though: our pre-trained model can only answer questions about resources it has
    # seen during training. For unseen resources, it cannot translate the name of the resource to
    # the identifier used in the knowledge graph, e.g. "Harry" -> "hp:Harry_potter"

    query = "In which house is Harry Potter?"
    print(f'Translating the text query "{query}" to a SPARQL query and executing it on the knowledge graph...')
    result = kgqa_retriever.retrieve(query=query)
    print(result)
    # Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . }
    # Correct answer: Gryffindor

print("Executing a SPARQL query with prefixed names of resources...")
|
|
result = kgqa_retriever._query_kg(
|
|
sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }"
|
|
)
|
|
print(result)
|
|
# Paraphrased question: Who is the keeper of keys and grounds?
|
|
# Correct answer: Rubeus Hagrid
|
|
|
|
print("Executing a SPARQL query with full names of resources...")
|
|
result = kgqa_retriever._query_kg(
|
|
sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
|
|
)
|
|
print(result)
|
|
# Paraphrased question: What is the patronus of Hermione?
|
|
# Correct answer: Otter
|
|
|
|
|
|
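    # A small usage sketch: further questions can be run through the same retriever in a loop.
    # The example questions are hypothetical; whether each is answerable depends on the resources
    # the model saw during training (see the limitation noted above).
    more_queries = ["What is the patronus of Harry Potter?", "In which house is Hermione Granger?"]
    for q in more_queries:
        print(f'Query: "{q}" -> {kgqa_retriever.retrieve(query=q)}')
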
if __name__ == "__main__":
    tutorial10_knowledge_graph()

# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/