# GraphRAG Quickstart

### Prerequisite installs to run the quickstart notebook
Install 3rd party packages that are not part of the Python Standard Library

In [None]:
! pip install devtools pandas python-magic requests tqdm

In [None]:
import getpass
import json
import sys
import time
from pathlib import Path

import magic
import pandas as pd
import requests
from devtools import pprint
from tqdm import tqdm

### Configuration - API key, file directories and API endpoints

#### Get API Key for API Management Service
For authentication, the API requires a *subscription key* to be passed in the header of all requests. To find this key, visit the Azure Portal. The API subscription key will be located under `Resource group --> API Management service --> APIs --> Subscriptions --> Built-in all-access subscription --> Primary Key`.

In [None]:
# Prompt for the APIM subscription key interactively (getpass does not echo the
# typed value), so the key does not have to be hard-coded in the notebook.
ocp_apim_subscription_key = getpass.getpass(
 "Enter the subscription key to the GraphRag APIM:"
)

#### Setup directories and API endpoint

The following parameters are required to access and use the GraphRAG solution accelerator API:
* file_directory
* storage_name
* index_name
* apim_url

For demonstration purposes, you may use the provided `get-wiki-articles.py` script to download a small set of Wikipedia articles, or provide your own data.

In [None]:
"""
These parameters must be defined by the user:

- file_directory: local directory where data files of interest are stored.
- storage_name: unique name for an Azure blob storage container where files will be uploaded.
- index_name: unique name for a single knowledge graph construction. Multiple indexes can be created from the same blob container of data.
- apim_url: the endpoint URL for the GraphRAG service (this is the Gateway URL found in the APIM resource).
"""

# All four values start out empty and must be filled in by the user; the
# following cell asserts that none of them is still an empty string.
file_directory = ""
storage_name = ""
index_name = ""
apim_url = ""

In [None]:
# Stop here, with an explanatory message, if any required parameter was left
# as an empty string in the configuration cell.
assert "" not in (file_directory, storage_name, index_name, apim_url), (
    "file_directory, storage_name, index_name and apim_url must all be set to "
    "non-empty values before continuing."
)

In [None]:
"""
"Ocp-Apim-Subscription-Key":
    This is a custom HTTP header used by Azure API Management service (APIM) to
    authenticate API requests. The value for this key should be set to the subscription
    key provided by the Azure APIM instance in your GraphRAG resource group.
"""

# Shared by every request helper defined below in this notebook.
headers = {"Ocp-Apim-Subscription-Key": ocp_apim_subscription_key}

## Upload Files to Storage Data

In [None]:
def upload_files(
    file_directory: str,
    storage_name: str,
    batch_size: int = 100,
    overwrite: bool = True,
    max_retries: int = 5,
) -> requests.Response | list[Path]:
    """
    Upload the .txt files found in a local directory to a blob storage container.

    Files are posted to the `/data` endpoint in batches of `batch_size`. Entries
    that are not regular files, do not have a `.txt` suffix, or are not detected
    as `text/plain` are skipped with a message.

    Args:
        file_directory - a local directory of .txt files to upload. Files are decoded as utf-8;
            bytes that cannot be decoded are dropped.
        storage_name - a unique name for the Azure storage container.
        batch_size - the number of files to upload in a single batch.
        overwrite - whether or not to overwrite files if they already exist in the storage container.
        max_retries - the maximum number of times to try uploading a batch of files if the API is busy
            (HTTP 500). At least one attempt is always made.

    Returns:
        The response of the last batch that was sent. If a batch fails, its response
        is returned immediately and the remaining files are not uploaded.

    Raises:
        ValueError - if the directory contains no valid file to upload.

    NOTE: Uploading files may sometimes fail if the blob container was recently deleted
    (i.e. a few seconds before). The solution "in practice" is to sleep a few seconds and try again.
    """
    url = apim_url + "/data"

    def upload_batch(
        files: list, storage_name: str, overwrite: bool, max_retries: int
    ) -> requests.Response:
        """Post one batch of files, retrying while the API answers with HTTP 500."""
        # Always make at least one attempt so there is a response to return.
        attempts = max(1, max_retries)
        for attempt in range(1, attempts + 1):
            response = requests.post(
                url=url,
                files=files,
                params={"storage_name": storage_name, "overwrite": overwrite},
                headers=headers,
            )
            # Stop on any non-busy answer, and do not sleep after the final attempt.
            if response.status_code != 500 or attempt == attempts:
                break
            print("API busy. Sleeping and will try again.")
            time.sleep(10)
        return response

    batch_files = []
    accepted_file_types = ["text/plain"]
    response = None
    filepaths = list(Path(file_directory).iterdir())
    for file in tqdm(filepaths):
        # validate that file is a file, has a .txt extension, and has an acceptable file type
        if (
            not file.is_file()
            or file.suffix != ".txt"
            or magic.from_file(str(file), mime=True) not in accepted_file_types
        ):
            print(f"Skipping invalid file: {file}")
            continue
        # Read the content up front (decoded as utf-8, ignoring bad characters)
        # instead of passing an open handle: no file is left open, and a retried
        # request re-sends the same content rather than an exhausted stream.
        content = file.read_text(encoding="utf-8", errors="ignore")
        batch_files.append(("files", (file.name, content)))
        # upload batch of files
        if len(batch_files) == batch_size:
            response = upload_batch(batch_files, storage_name, overwrite, max_retries)
            # if response is not ok, return early
            if not response.ok:
                return response
            batch_files.clear()
    # upload remaining files
    if batch_files:
        response = upload_batch(batch_files, storage_name, overwrite, max_retries)
    if response is None:
        raise ValueError(f"No valid .txt files found to upload in: {file_directory}")
    return response

In [None]:
# Upload the files from the configured directory into the configured container.
response = upload_files(
    file_directory=file_directory,
    storage_name=storage_name,
    batch_size=100,
    overwrite=True,
)
# On success show the response object itself; on failure show the error body.
print(response if response.ok else response.text)

## Create an Index

After data files have been uploaded, it is now possible to construct a knowledge graph by creating a search index. If an entity configuration is not provided, a default entity configuration will be used that has been shown to generally work well.

In [None]:
def build_index(
    storage_name: str,
    index_name: str,
) -> requests.Response:
    """Submit an indexing job.

    Posts to the `/index` endpoint, passing the blob storage container name and
    the index name as query parameters, to kick off a job that builds a
    knowledge graph (KG) index from the files in that container.
    """
    return requests.post(
        apim_url + "/index",
        params={"storage_name": storage_name, "index_name": index_name},
        headers=headers,
    )

In [None]:
# Submit the indexing job for the configured container and index name.
response = build_index(storage_name=storage_name, index_name=index_name)
print(response)
# Print the response body as-is on success, or prefixed with a failure notice.
print(
    response.text
    if response.ok
    else f"Failed to submit job.\nStatus: {response.text}"
)

### Check the status of an indexing job

Please wait for your index to reach 100 percent complete before continuing on to the next section to run queries.

In [None]:
def index_status(index_name: str) -> requests.Response:
    """Request the status of the indexing job for the given index name."""
    status_path = f"/index/status/{index_name}"
    return requests.get(apim_url + status_path, headers=headers)

In [None]:
# Fetch the current status of the indexing job for the configured index.
response = index_status(index_name)

# Pretty-print the JSON body of the status response.
pprint(response.json())

## Query

After an indexing job has completed, the knowledge graph is ready to query. Two types of queries (global and local) are currently supported. In addition, you can issue a query over a single index or multiple indexes.

In [None]:
def parse_query_response(
    response: requests.Response, return_context_data: bool = False
) -> requests.Response | dict:
    """
    Helper that prints the result of a query response.

    On a successful response, the JSON body is parsed and its "result" value is
    printed. On a failed response, the reason and raw content are printed instead.

    Args:
        response - the response returned by a query request.
        return_context_data - when True and the response is ok, return the
            "context_data" value of the body instead of the response.

    Returns:
        The "context_data" value when requested and the response is ok;
        otherwise the response itself.
    """
    if not response.ok:
        print(response.reason)
        print(response.content)
        return response
    # Parse the body once and reuse it for both the result and the context data.
    payload = json.loads(response.text)
    print(payload["result"])
    if return_context_data:
        return payload["context_data"]
    return response

### Global Query 

Global search queries are resource-intensive, but give good responses to questions that require an understanding of the dataset as a whole.

In [None]:
def global_search(index_name: str | list[str], query: str) -> requests.Response:
    """Send a global query to the `/query/global` endpoint.

    `index_name` may be a single index name or a list of index names; it is
    sent together with `query` as the JSON body of the request.
    """
    endpoint = apim_url + "/query/global"
    return requests.post(
        endpoint,
        json={"index_name": index_name, "query": query},
        headers=headers,
    )

In [None]:
%%time
# To query a single index, pass its name as a string; to query across multiple
# indexes, pass a list instead, e.g. index_name=[myindex1, myindex2].
global_response = global_search(
 index_name=index_name, query="Summarize the main topics of this data"
)
# Print the result and keep the returned context data in a variable.
global_response_data = parse_query_response(global_response, return_context_data=True)
# Last expression in the cell, so the notebook displays its value.
global_response_data

### Local Query

Local search queries are best suited for narrowly focused questions that require an understanding of specific entities mentioned in the documents (e.g. "What are the healing properties of chamomile?").

In [None]:
def local_search(index_name: str | list[str], query: str) -> requests.Response:
    """Send a local query to the `/query/local` endpoint.

    `index_name` may be a single index name or a list of index names; it is
    sent together with `query` as the JSON body of the request.
    """
    endpoint = apim_url + "/query/local"
    return requests.post(
        endpoint,
        json={"index_name": index_name, "query": query},
        headers=headers,
    )

In [None]:
%%time
# To query a single index, pass its name as a string; to query across multiple
# indexes, pass a list instead, e.g. index_name=[myindex1, myindex2].
local_response = local_search(
 index_name=index_name, query="Who are the primary actors in these communities?"
)
# Print the result and keep the returned context data in a variable.
local_response_data = parse_query_response(local_response, return_context_data=True)
# Last expression in the cell, so the notebook displays its value.
local_response_data