mirror of
https://github.com/Azure-Samples/graphrag-accelerator.git
synced 2025-07-24 09:50:07 +00:00
138 lines
3.3 KiB
YAML
138 lines
3.3 KiB
YAML
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# This YAML file serves as a configuration template for the graphrag indexing jobs.
# Some values are hardcoded, while others, denoted by PLACEHOLDER, will be dynamically set.
|
|
|
|
###################### LLM settings ######################
|
|
# Name of the token encoding to use. Presumably a tokenizer encoding
# identifier; keep it in sync with the deployed model (TODO confirm).
encoding_model: cl100k_base # this needs to be matched to your model!
|
|
|
|
# Settings for the chat model (type `azure_openai_chat`).
# `$NAME` values are references left unresolved in this template; presumably
# they are filled in from the environment at load time (confirm in the loader).
llm:
  type: azure_openai_chat
  api_base: $GRAPHRAG_API_BASE
  api_version: $GRAPHRAG_API_VERSION
  model: $GRAPHRAG_LLM_MODEL
  deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME
  cognitive_services_endpoint: $COGNITIVE_SERVICES_AUDIENCE
  model_supports_json: true
  # Throughput limits and retry policy. Integers are written without `_`
  # separators so that YAML 1.1 and YAML 1.2 parsers both read them as ints.
  tokens_per_minute: 80000
  requests_per_minute: 480
  concurrent_requests: 25
  max_retries: 250
  max_retry_wait: 60.0
  sleep_on_rate_limit_recommendation: true
|
|
|
|
# Parallelism settings: thread count and the stagger value between tasks.
parallelization:
  num_threads: 10
  stagger: 0.25
|
|
|
|
# Concurrency mode; the inline note lists `asyncio` as the alternative value.
async_mode: threaded # or asyncio
|
|
|
|
# Embedding settings, split into two nested mappings:
#   vector_store - the Azure AI Search target for the embeddings
#   llm          - the embedding model (type `azure_openai_embedding`)
# PLACEHOLDER values are set dynamically, per the note at the top of this file.
embeddings:
  vector_store:
    type: azure_ai_search
    collection_name: PLACEHOLDER
    title_column: name
    overwrite: true
    url: $AI_SEARCH_URL
    audience: $AI_SEARCH_AUDIENCE
  llm:
    type: azure_openai_embedding
    api_base: $GRAPHRAG_API_BASE
    api_version: $GRAPHRAG_API_VERSION
    batch_size: 10
    model: $GRAPHRAG_EMBEDDING_MODEL
    deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
    cognitive_services_endpoint: $COGNITIVE_SERVICES_AUDIENCE
    # Plain integers (no `_` separators) parse as ints under YAML 1.1 and 1.2.
    tokens_per_minute: 350000
    requests_per_minute: 2100
|
|
|
|
###################### Input settings ######################
|
|
# Source documents: UTF-8 text files read from a blob container.
# `container_name` is a PLACEHOLDER that is set dynamically.
input:
  type: blob
  file_type: text
  base_dir: '.'
  file_encoding: utf-8
  # Regular expression selecting input files; single-quoted so the backslash
  # and `$` stay literal.
  file_pattern: '.*\.txt$'
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  container_name: PLACEHOLDER
|
|
|
|
# Text chunking: chunk size, overlap between chunks, and the columns used to
# group rows. Units are not stated in this file (presumably tokens; confirm).
chunks:
  size: 1200
  overlap: 100
  group_by_columns: [id]
|
|
|
|
###################### Storage settings ######################
|
|
# Cache location: blob storage, under the `cache` base directory.
# `container_name` is a PLACEHOLDER that is set dynamically.
cache:
  type: blob
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  container_name: PLACEHOLDER
  base_dir: cache
|
|
|
|
# Reporting location: blob storage, under the `logs` base directory.
# `container_name` is a PLACEHOLDER that is set dynamically.
reporting:
  type: blob
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  container_name: PLACEHOLDER
  base_dir: logs
|
|
|
|
# Storage location: blob storage, under the `output` base directory.
# `container_name` is a PLACEHOLDER that is set dynamically.
storage:
  type: blob
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  container_name: PLACEHOLDER
  base_dir: output
|
|
|
|
###################### Workflow settings ######################
|
|
# Workflows to skip; the explicit empty list means none are skipped.
skip_workflows: []
|
|
|
|
# Entity extraction: the prompt (PLACEHOLDER, set dynamically), the entity
# types to extract, and the gleaning limit.
entity_extraction:
  prompt: PLACEHOLDER
  entity_types: [organization, person, geo, event]
  max_gleanings: 1
|
|
|
|
# Description summarization: the prompt (PLACEHOLDER, set dynamically) and the
# maximum summary length.
summarize_descriptions:
  prompt: PLACEHOLDER
  max_length: 500
|
|
|
|
# Claim extraction is disabled (`enabled: false`). Unlike the other workflow
# sections, its prompt is a literal path rather than a PLACEHOLDER.
claim_extraction:
  enabled: false
  prompt: "prompts/claim_extraction.txt"
  description: "Any claims or facts that could be relevant to information discovery."
  max_gleanings: 1
|
|
|
|
# Community reports: the prompt (PLACEHOLDER, set dynamically) plus output and
# input length limits. Plain integers (no `_` separators) parse as ints under
# YAML 1.1 and 1.2.
community_reports:
  prompt: PLACEHOLDER
  max_length: 2000
  max_input_length: 8000
|
|
|
|
# Graph clustering: upper bound on cluster size.
cluster_graph:
  max_cluster_size: 10
|
|
|
|
# Graph embedding is disabled.
embed_graph:
  enabled: false
|
|
|
|
# UMAP is disabled.
umap:
  enabled: false
|
|
|
|
# Snapshot toggles: only the GraphML snapshot is enabled.
# Booleans are written in lowercase throughout for consistency.
snapshots:
  graphml: true
  embeddings: false
  transient: false
|
|
|
|
###################### Query settings ######################
|
|
## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
|
|
# Local search: prompt location (PLACEHOLDER, set dynamically).
local_search:
  prompt: PLACEHOLDER
|
|
|
|
# Global search: map, reduce, and knowledge prompt locations
# (all PLACEHOLDER, set dynamically).
global_search:
  map_prompt: PLACEHOLDER
  reduce_prompt: PLACEHOLDER
  knowledge_prompt: PLACEHOLDER
|
|
|
|
# DRIFT search: main and reduce prompt locations
# (both PLACEHOLDER, set dynamically).
drift_search:
  prompt: PLACEHOLDER
  reduce_prompt: PLACEHOLDER
|
|
|
|
# Basic search: prompt location (PLACEHOLDER, set dynamically).
basic_search:
  prompt: PLACEHOLDER
|