Leonardo Pinheiro 95bd514a9a
Graphrag integration (#4612)
* add initial global search draft

* add graphrag dep

* fix local search embedding

* linting

* add from config constructor

* remove draft notebook

* update config factory and add docstrings

* add graphrag sample

* add sample prompts

* update readme

* update deps

* Add API docs

* Update python/samples/agentchat_graphrag/requirements.txt

* Update python/samples/agentchat_graphrag/requirements.txt

* update docstrings with snippet and doc ref

* lint

* improve set up instructions in docstring

* lint

* update lock

* Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_global_search.py

Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>

* Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_local_search.py

Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>

* add unit tests

* update lock

* update uv lock

* add docstring newlines

* stubs and typing on graphrag tests

* fix docstrings

* fix mypy error

* + linting and type fixes

* type fix graphrag sample

* Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_global_search.py

Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>

* Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_local_search.py

Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>

* Update python/samples/agentchat_graphrag/requirements.txt

Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>

* update overrides

* fix docstring client imports

* additional docstring fix

* add docstring missing import

* use openai and fix db path

* use console for displaying messages

* add model config and gitignore

* update readme

* lint

* Update python/samples/agentchat_graphrag/README.md

* Update python/samples/agentchat_graphrag/README.md

* Comment remaining azure config

---------

Co-authored-by: Leonardo Pinheiro <lpinheiro@microsoft.com>
Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>
2025-01-15 21:04:17 +10:00
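The commits above add two tools, GlobalSearchTool and LocalSearchTool, under autogen_ext.tools.graphrag, plus a sample in python/samples/agentchat_graphrag that drives them from a GraphRAG settings file like the one reproduced below. A minimal sketch of constructing the tools from that file, assuming the from_settings constructor referenced in the commit list ("add from config constructor") and a GraphRAG project that has already been indexed:

    from autogen_ext.tools.graphrag import GlobalSearchTool, LocalSearchTool

    # Both tools are configured from the GraphRAG settings.yaml shown below; prompt
    # paths, the LanceDB vector store, and model settings are all resolved from it.
    global_tool = GlobalSearchTool.from_settings(settings_path="./settings.yaml")
    local_tool = LocalSearchTool.from_settings(settings_path="./settings.yaml")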


### This config file contains required core defaults that must be set, along with a handful of common optional settings.
### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/

### LLM settings ###
## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.

encoding_model: cl100k_base # this needs to be matched to your model!

llm:
  api_key: null
  type: openai_chat # or azure_openai_chat
  model: gpt-4o
  model_supports_json: true # recommended if this is available for your model.
  # audience: "https://cognitiveservices.azure.com/.default"
  # api_base: https://<resource-name>.openai.azure.com
  # api_version: 2024-08-01-preview
  # deployment_name: gpt-4o

parallelization:
  stagger: 0.3
  # num_threads: 50

async_mode: threaded # or asyncio

embeddings:
  async_mode: threaded # or asyncio
  vector_store:
    type: lancedb
    db_uri: 'data/output/lancedb'
    container_name: default
    overwrite: true
  llm:
    api_key: null
    type: openai_embedding # or azure_openai_embedding
    model: text-embedding-3-small
    # api_base: https://<resource-name>.openai.azure.com
    # api_version: "2023-05-15"
    # audience: "https://cognitiveservices.azure.com/.default"
    # deployment_name: text-embedding-3-small

### Input settings ###

input:
  type: file # or blob
  file_type: text # or csv
  base_dir: "data/input"
  file_encoding: utf-8
  file_pattern: ".*\\.txt$"

chunks:
  size: 1200
  overlap: 100
  group_by_columns: [id]

### Storage settings ###
## If blob storage is specified in the following four sections,
## connection_string and container_name must be provided

cache:
  type: file # or blob
  base_dir: "cache"

reporting:
  type: file # or console, blob
  base_dir: "logs"

storage:
  type: file # or blob
  base_dir: "data/output"

## only turn this on if running `graphrag index` with custom settings
## we normally use `graphrag update` with the defaults
update_index_storage:
  # type: file # or blob
  # base_dir: "update_output"

### Workflow settings ###

skip_workflows: []

entity_extraction:
  prompt: "prompts/entity_extraction.txt"
  entity_types: [organization,person,geo,event]
  max_gleanings: 1

summarize_descriptions:
  prompt: "prompts/summarize_descriptions.txt"
  max_length: 500

claim_extraction:
  enabled: false
  prompt: "prompts/claim_extraction.txt"
  description: "Any claims or facts that could be relevant to information discovery."
  max_gleanings: 1

community_reports:
  prompt: "prompts/community_report.txt"
  max_length: 2000
  max_input_length: 8000

cluster_graph:
  max_cluster_size: 10

embed_graph:
  enabled: false # if true, will generate node2vec embeddings for nodes

umap:
  enabled: false # if true, will generate UMAP embeddings for nodes

snapshots:
  graphml: false
  raw_entities: false
  top_level_nodes: false
  embeddings: false
  transient: false

### Query settings ###
## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query

local_search:
  prompt: "prompts/local_search_system_prompt.txt"

global_search:
  map_prompt: "prompts/global_search_map_system_prompt.txt"
  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"

drift_search:
  prompt: "prompts/drift_search_system_prompt.txt"