From 76b88dbaec2a28c41d18fb969026b5ecaead27ea Mon Sep 17 00:00:00 2001 From: AlonsoGuevara Date: Tue, 23 Apr 2024 22:13:30 +0000 Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20@=20microsof?= =?UTF-8?q?t/graphrag@b3855a5f7f0b01630a99db5e1c33da567e3a16a7=20?= =?UTF-8?q?=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../notebooks/global_search_nb/index.html | 447 +++------------ .../notebooks/local_search_nb/index.html | 529 +++++------------- 2 files changed, 210 insertions(+), 766 deletions(-) diff --git a/posts/query/notebooks/global_search_nb/index.html b/posts/query/notebooks/global_search_nb/index.html index 435c4169..bd1209ed 100644 --- a/posts/query/notebooks/global_search_nb/index.html +++ b/posts/query/notebooks/global_search_nb/index.html @@ -1,299 +1,31 @@ - - - - - - - - - Global Search Notebook - - - - - - - - - - - - - - -
- - GraphRAG -
-
- - - - -
-

Global Search Notebook

-
-
"""
-Copyright (c) Microsoft Corporation. All rights reserved.
-"""
+
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
+

'\nCopyright (c) Microsoft Corporation. All rights reserved.\n'

-
import os
+  
import os
+from pathlib import Path
+
 import pandas as pd
 import tiktoken
+
+from graphrag.query.input.loaders.dfs import read_community_reports
 from graphrag.query.llm.oai.chat_openai import ChatOpenAI
 from graphrag.query.llm.oai.typing import OpenaiApiType
-from graphrag.query.input.loaders.dfs import read_community_reports
+from graphrag.query.structured_search.global_search.community_context import (
+    GlobalCommunityContext,
+)
 from graphrag.query.structured_search.global_search.search import GlobalSearch
-from graphrag.query.structured_search.global_search.community_context import GlobalCommunityContext
 
-print(os.getcwd())
+print(Path.cwd())
-
@@ -302,22 +34,19 @@ Copyright (c) Microsoft Corporation. All rights reserved.

LLM setup

-

-api_key = "<api_key>"
-api_version = "api_version"
-llm_model = "model or deployment id"
+  
api_key = os.environ["GRAPHRAG_API_KEY"]
+llm_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
 
 llm = ChatOpenAI(
     api_key=api_key,
     model=llm_model,
-    api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
-    api_version=api_version,
-    max_retries=20
+    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
+    max_retries=20,
 )
 
 token_encoder = tiktoken.get_encoding("cl100k_base")
-
@@ -327,35 +56,38 @@ token_encoder = tiktoken -
# parquet files generated from indexing pipeline
-INPUT_DIR = "./data"
+  
# parquet files generated from indexing pipeline
+INPUT_DIR = "./inputs/operation dulce"
 COMMUNITY_REPORT_TABLE = "create_final_community_reports"
 ENTITY_TABLE = "create_final_nodes"
 
 # community level in the Leiden community hierarchy from which we will load the community reports
 # higher value means we use reports on smaller communities (and thus will have more reports to query aga
-COMMUNITY_LEVEL = 2
-
+COMMUNITY_LEVEL = 2
-
-

-entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-entity_df = entity_df[(entity_df.type=="entity") & (entity_df.level<="level_{COMMUNITY_LEVEL}")]
+  
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
+entity_df = entity_df[
+    (entity_df.type == "entity") & (entity_df.level <= f"level_{COMMUNITY_LEVEL}")
+]
 entity_df["community"] = entity_df["community"].fillna(-1)
 entity_df["community"] = entity_df["community"].astype(int)
 
-entity_df = entity_df.groupby(["title"]).agg({"community": "max"}).resetindex()
+entity_df = entity_df.groupby(["title"]).agg({"community": "max"}).reset_index()
 entity_df["community"] = entity_df["community"].astype(str)
-filtered_community_df = entity_df.rename(columns={"community": "community_id"})["community_id"].drop_duplicates()
+filtered_community_df = entity_df.rename(columns={"community": "community_id"})[
+    "community_id"
+].drop_duplicates()
 
 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
 report_df = report_df[report_df.level <= f"level_{COMMUNITY_LEVEL}"]
 
+report_df["rank"] = report_df["rank"].fillna(-1)
 report_df["rank"] = report_df["rank"].astype(int)
 
 report_df = report_df.merge(filtered_community_df, on="community_id", how="inner")
@@ -373,34 +105,33 @@ reports = read_community_reports=None,
 )
 
-print(f'Report records: {len(report_df)}')
+print(f"Report records: {len(report_df)}")
 report_df.head()
-

Build global context based on community reports

-
context_builder = GlobalCommunityContext(
-    community_reports=reports,
-    token_encoder=token_encoder
+  
context_builder = GlobalCommunityContext(
+    community_reports=reports, token_encoder=token_encoder
 )
-

Perform global search

-
context_builder_params = {
-    "use_community_summary": False, # False means using full community reports. True means using community short summaries.
+  
context_builder_params = {
+    "use_community_summary": False,  # False means using full community reports. True means using community short summaries.
     "shuffle_data": True,
     "include_community_rank": True,
     "min_community_rank": 0,
-    "max_tokens": 16000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
+    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
     "context_name": "Reports",
 }
 
@@ -410,78 +141,60 @@ map_llm_params = }
 
 reduce_llm_params = {
-    "max_tokens": 2000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
+    "max_tokens": 2000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
     "temperature": 0.0,
 }
- -
- -
-
search_engine = GlobalSearch(
-    llm=llm,
-    context_builder=context_builder,
-    token_encoder=token_encoder,
-    max_data_tokens = 16000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
-    map_llm_params=map_llm_params,
-    reduce_llm_params=reduce_llm_params,
-    context_builder_params=context_builder_params,
-    concurrent_coroutines=32,
-    response_type="multiple paragraphs" # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
-)
- - -
- -
-
result = await search_engine.asearch('How reliable are medicinal herbs?​')
-
-print(result.response)
- - -
- -
-
# inspect the data used to build the context for the LLM responses
-result.context_data["reports"]
-
-
# inspect number of LLM calls and tokens
-print(f'LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}')
+
search_engine = GlobalSearch(
+    llm=llm,
+    context_builder=context_builder,
+    token_encoder=token_encoder,
+    max_data_tokens=16_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
+    map_llm_params=map_llm_params,
+    reduce_llm_params=reduce_llm_params,
+    context_builder_params=context_builder_params,
+    concurrent_coroutines=32,
+    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
+)
-

LLM calls: 13. LLM tokens: 184660

- - - - - \ No newline at end of file +
+
result = await search_engine.asearch(
+    "What is the major conflict in this story and who are the protagonist and antagonist?"
+)
+
+print(result.response)
+ + +
+ +
+
# inspect the data used to build the context for the LLM responses
+result.context_data["reports"]
+ + +
+ +
+
# inspect number of LLM calls and tokens
+print(f"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}")
+ + +
+

LLM calls: 13. LLM tokens: 184660

diff --git a/posts/query/notebooks/local_search_nb/index.html b/posts/query/notebooks/local_search_nb/index.html index 5af01222..8bc08d5a 100644 --- a/posts/query/notebooks/local_search_nb/index.html +++ b/posts/query/notebooks/local_search_nb/index.html @@ -1,280 +1,6 @@ - - - - - - - - - Local Search Notebook - - - - - - - - - - - - - - -
- - GraphRAG -
-
- - - - -
-

Local Search Notebook

-
-
"""
-Copyright (c) Microsoft Corporation. All rights reserved.
-"""
+
# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
- -
-
- - - \ No newline at end of file