# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.query.indexer_adapters import read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch
print(Path.cwd())
## Global Search example
The global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses to questions that require an understanding of the dataset as a whole (e.g., What are the top themes in the dataset?).
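To make the pattern concrete, here is a minimal, illustrative sketch of the map-reduce flow. This is not the library's implementation; map_step and reduce_step are hypothetical stand-ins for the LLM calls that GlobalSearch makes internally.
def map_step(question: str, report_batch: list[str]) -> str:
    # In GraphRAG this is an LLM call that extracts rated key points
    # from one batch of community reports.
    return f"partial answer from {len(report_batch)} reports"
def reduce_step(question: str, partial_answers: list[str]) -> str:
    # A final LLM call combines the ranked partial answers into one response.
    return " | ".join(partial_answers)
def toy_global_search(question: str, batches: list[list[str]]) -> str:
    partials = [map_step(question, b) for b in batches]  # map phase (run concurrently in practice)
    return reduce_step(question, partials)  # reduce phase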
### LLM setup
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]  # chat model (not the embedding model)
llm = ChatOpenAI(
api_key=api_key,
model=llm_model,
api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
max_retries=20,
)
token_encoder = tiktoken.get_encoding("cl100k_base")
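If you are pointing at Azure OpenAI instead, the same ChatOpenAI constructor takes the Azure-specific arguments shown below; the endpoint, API version, and deployment name are placeholders, not values used elsewhere in this notebook.
# Hedged Azure OpenAI variant -- replace the placeholders with your own values
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    deployment_name=llm_model,  # name of your Azure deployment (placeholder)
    api_base="https://<your-resource>.openai.azure.com",  # placeholder endpoint
    api_version="2024-02-15-preview",  # placeholder API version
    api_type=OpenaiApiType.AzureOpenAI,
    max_retries=20,
)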
### Load community reports as context for global search
- Load all community reports in the `create_final_community_reports` table from the GraphRAG indexing engine.
# parquet files generated from indexing pipeline
INPUT_DIR = "./inputs/operation dulce"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
# community level in the Leiden community hierarchy from which we will load the community reports
# higher value means we use reports on smaller communities (and thus will have more reports to query against)
COMMUNITY_LEVEL = 2
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
print(f"Report records: {len(report_df)}")
report_df.head()
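Before building the context, it can help to confirm what read_indexer_reports returned. The attribute names below (title, rank) match the CommunityReport objects in this library version and may differ in newer releases.
# Spot-check the loaded CommunityReport objects
print(f"Community reports loaded at level {COMMUNITY_LEVEL}: {len(reports)}")
print(reports[0].title, reports[0].rank)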
### Build global context based on community reports
context_builder = GlobalCommunityContext(
community_reports=reports, token_encoder=token_encoder
)
### Perform global search
context_builder_params = {
"use_community_summary": False, # False means using full community reports. True means using community short summaries.
"shuffle_data": True,
"include_community_rank": True,
"min_community_rank": 0,
"max_tokens": 12_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
"context_name": "Reports",
}
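As an optional sanity check, you can run the context builder once by hand with these same parameters to inspect exactly what the map step will see. This is a hedged sketch: the return shape of build_context (a list of context batches plus a dict of record DataFrames here) may differ across graphrag versions.
# Preview the raw context that would be packed for the map calls
context_chunks, context_records = context_builder.build_context(**context_builder_params)
print(f"Context batches: {len(context_chunks)}")
print(context_chunks[0][:500])  # first 500 characters of the first batch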
map_llm_params = {
"max_tokens": 500,
"temperature": 0.0,
}
reduce_llm_params = {
"max_tokens": 2000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
"temperature": 0.0,
}
search_engine = GlobalSearch(
llm=llm,
context_builder=context_builder,
token_encoder=token_encoder,
max_data_tokens=16_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
map_llm_params=map_llm_params,
reduce_llm_params=reduce_llm_params,
context_builder_params=context_builder_params,
concurrent_coroutines=32,
    response_type="multiple paragraphs",  # free-form description of the desired response type and format, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)
result = await search_engine.asearch(
"What is the major conflict in this story and who are the protagonist and antagonist?"
)
print(result.response)
# inspect the data used to build the context for the LLM responses
result.context_data["reports"]
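The "reports" entry is a pandas DataFrame holding the report rows that fit into the final context window; a quick look at how many made the cut:
reports_used = result.context_data["reports"]
print(f"{len(reports_used)} report records were packed into the context")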
# inspect number of LLM calls and tokens
print(f"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}")
LLM calls: 13. LLM tokens: 184660
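A rough sanity check derived from the numbers above: the average prompt size per call should sit below the max_data_tokens budget (16_000 here).
# 184660 / 13 ≈ 14,205 tokens per call, under the 16k budget
print(f"~{result.prompt_tokens / result.llm_calls:,.0f} prompt tokens per LLM call")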