mirror of
https://github.com/microsoft/autogen.git
synced 2025-07-24 17:31:41 +00:00

* add initial global search draft * add graphrag dep * fix local search embedding * linting * add from config constructor * remove draft notebook * update config factory and add docstrings * add graphrag sample * add sample prompts * update readme * update deps * Add API docs * Update python/samples/agentchat_graphrag/requirements.txt * Update python/samples/agentchat_graphrag/requirements.txt * update docstrings with snippet and doc ref * lint * improve set up instructions in docstring * lint * update lock * Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_global_search.py Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com> * Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_local_search.py Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com> * add unit tests * update lock * update uv lock * add docstring newlines * stubs and typing on graphrag tests * fix docstrings * fix mypy error * + linting and type fixes * type fix graphrag sample * Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_global_search.py Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com> * Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_local_search.py Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com> * Update python/samples/agentchat_graphrag/requirements.txt Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com> * update overrides * fix docstring client imports * additional docstring fix * add docstring missing import * use openai and fix db path * use console for displaying messages * add model config and gitignore * update readme * lint * Update python/samples/agentchat_graphrag/README.md * Update python/samples/agentchat_graphrag/README.md * Comment remaining azure config --------- Co-authored-by: Leonardo Pinheiro <lpinheiro@microsoft.com> Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>
131 lines
3.4 KiB
YAML
131 lines
3.4 KiB
YAML
### This config file contains required core defaults that must be set, along with a handful of common optional settings.
|
|
### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/
|
|
|
|
### LLM settings ###
|
|
## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.
|
|
|
|
# Keep this encoding matched to the model you use.
encoding_model: 'cl100k_base'
|
|
|
|
# Chat model settings.
llm:
  api_key: null
  # Alternative type: azure_openai_chat
  type: 'openai_chat'
  model: 'gpt-4o'
  # Recommended if this is available for your model.
  model_supports_json: true
  # Options left commented out (the sample endpoints are Azure ones):
  # audience: "https://cognitiveservices.azure.com/.default"
  # api_base: https://<resource-name>.openai.azure.com
  # api_version: 2024-08-01-preview
  # deployment_name: gpt-4o
|
|
|
|
# Parallelization settings; num_threads is left commented out.
parallelization:
  stagger: 0.3
  # num_threads: 50
|
|
|
|
# Alternative value: asyncio
async_mode: 'threaded'
|
|
|
|
# Embedding settings: async mode, vector store, and embedding model.
embeddings:
  # Alternative value: asyncio
  async_mode: 'threaded'
  vector_store:
    type: 'lancedb'
    db_uri: 'data/output/lancedb'
    container_name: 'default'
    overwrite: true
  llm:
    api_key: null
    # Alternative type: azure_openai_embedding
    type: 'openai_embedding'
    model: 'text-embedding-3-small'
    # Options left commented out (the sample endpoints are Azure ones):
    # api_base: https://<resource-name>.openai.azure.com
    # api_version: "2023-05-15"
    # audience: "https://cognitiveservices.azure.com/.default"
    # deployment_name: text-embedding-3-small
|
|
|
|
### Input settings ###
|
|
|
|
# Where the input documents are read from.
input:
  # Alternative type: blob
  type: 'file'
  # Alternative file_type: csv
  file_type: 'text'
  base_dir: 'data/input'
  file_encoding: 'utf-8'
  # Single-quoted, so the backslash is literal: the value is .*\.txt$
  file_pattern: '.*\.txt$'
|
|
|
|
# Chunk size, overlap, and grouping columns.
chunks:
  size: 1200
  overlap: 100
  group_by_columns:
    - id
|
|
|
|
### Storage settings ###
|
|
## If blob storage is specified in the following four sections,
|
|
## connection_string and container_name must be provided
|
|
|
|
cache:
  # Alternative type: blob
  type: 'file'
  base_dir: 'cache'
|
|
|
|
reporting:
  # Alternative types: console, blob
  type: 'file'
  base_dir: 'logs'
|
|
|
|
storage:
  # Alternative type: blob
  type: 'file'
  base_dir: 'data/output'
|
|
|
|
## Only turn this on if running `graphrag index` with custom settings;
## we normally use `graphrag update` with the defaults.
# Every sub-key is commented out, so the value is spelled as an explicit null
# instead of a bare `key:` (which parses to null anyway but reads as an
# accidental omission). To enable it, remove `null` and uncomment the
# sub-keys below.
update_index_storage: null
  # type: file # or blob
  # base_dir: "update_output"
|
|
|
|
### Workflow settings ###
|
|
|
|
# Explicit empty list; presumably the names of workflows to skip go here (confirm against the GraphRAG config docs).
skip_workflows: []
|
|
|
|
# Prompt file, entity types, and gleaning limit for entity extraction.
entity_extraction:
  prompt: 'prompts/entity_extraction.txt'
  entity_types:
    - organization
    - person
    - geo
    - event
  max_gleanings: 1
|
|
|
|
# Prompt file and maximum length for description summaries.
summarize_descriptions:
  prompt: 'prompts/summarize_descriptions.txt'
  max_length: 500
|
|
|
|
# Claim extraction is switched off here (enabled: false).
claim_extraction:
  enabled: false
  prompt: 'prompts/claim_extraction.txt'
  description: 'Any claims or facts that could be relevant to information discovery.'
  max_gleanings: 1
|
|
|
|
# Prompt file and length limits for community reports.
community_reports:
  prompt: 'prompts/community_report.txt'
  max_length: 2000
  max_input_length: 8000
|
|
|
|
# Upper bound on cluster size.
cluster_graph:
  max_cluster_size: 10
|
|
|
|
embed_graph:
  # Set to true to generate node2vec embeddings for nodes.
  enabled: false
|
|
|
|
umap:
  # Set to true to generate UMAP embeddings for nodes.
  enabled: false
|
|
|
|
# Every snapshot flag below is set to false.
snapshots:
  graphml: false
  raw_entities: false
  top_level_nodes: false
  embeddings: false
  transient: false
|
|
|
|
### Query settings ###
|
|
## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
|
|
## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
|
|
|
|
# Prompt file for local search.
local_search:
  prompt: 'prompts/local_search_system_prompt.txt'
|
|
|
|
# Prompt files for global search: map, reduce, and knowledge.
global_search:
  map_prompt: 'prompts/global_search_map_system_prompt.txt'
  reduce_prompt: 'prompts/global_search_reduce_system_prompt.txt'
  knowledge_prompt: 'prompts/global_search_knowledge_system_prompt.txt'
|
|
|
|
# Prompt file for drift search.
drift_search:
  prompt: 'prompts/drift_search_system_prompt.txt'
|