2024-06-26 15:45:06 -04:00
|
|
|
# Copyright (c) Microsoft Corporation.
|
|
|
|
# Licensed under the MIT License.
|
|
|
|
|
|
|
|
# This YAML file is a configuration template for graphrag indexing jobs.
|
2024-06-27 00:23:38 -04:00
|
|
|
# Some values are hardcoded; values denoted by PLACEHOLDER will be set dynamically.
|
2024-06-26 15:45:06 -04:00
|
|
|
# Input documents: text files read from blob storage.
input:
  type: blob
  file_type: text
  # Regex selecting which files to read; quoted so the backslash and `$`
  # are taken literally by the YAML parser.
  file_pattern: '.*\.txt$'
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is replaced dynamically (see the note at the top of this file).
  container_name: PLACEHOLDER
  base_dir: .
|
|
|
|
|
|
|
|
# Storage section: blob-backed, rooted at the `output` base directory.
storage:
  type: blob
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is replaced dynamically (see the note at the top of this file).
  container_name: PLACEHOLDER
  base_dir: output
|
|
|
|
|
|
|
|
# Reporting section: blob-backed, rooted at the `logs` base directory.
reporting:
  type: blob
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is replaced dynamically (see the note at the top of this file).
  container_name: PLACEHOLDER
  base_dir: logs
|
|
|
|
|
|
|
|
# Cache section: blob-backed, rooted at the `cache` base directory.
cache:
  type: blob
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is replaced dynamically (see the note at the top of this file).
  container_name: PLACEHOLDER
  base_dir: cache
|
|
|
|
|
|
|
|
# Chat LLM settings; endpoint, version, model and deployment are supplied
# through $GRAPHRAG_* variables.
llm:
  type: azure_openai_chat
  api_base: $GRAPHRAG_API_BASE
  api_version: $GRAPHRAG_API_VERSION
  model: $GRAPHRAG_LLM_MODEL
  deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME
  cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
  # Canonical lowercase boolean (was `True`) so every YAML loader/linter
  # agrees on the type.
  model_supports_json: true
  # NOTE(review): the throughput values below are hardcoded; presumably they
  # should match the quota of the target deployment - confirm.
  tokens_per_minute: 80000
  requests_per_minute: 480
  thread_count: 50
  concurrent_requests: 25
|
|
|
|
|
|
|
|
# Top-level parallelization settings (the embeddings section carries its own
# copy with the same values).
parallelization:
  stagger: 0.25
  num_threads: 10

# Top-level async mode; `embeddings.async_mode` is set to the same value.
async_mode: threaded
|
|
|
|
|
|
|
|
# Embedding settings: a dedicated embedding LLM, its own parallelization
# settings, and the vector store configuration.
embeddings:
  async_mode: threaded
  llm:
    type: azure_openai_embedding
    api_base: $GRAPHRAG_API_BASE
    api_version: $GRAPHRAG_API_VERSION
    batch_size: 16
    model: $GRAPHRAG_EMBEDDING_MODEL
    deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
    cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
    # NOTE(review): the throughput values below are hardcoded; presumably
    # they should match the quota of the embedding deployment - confirm.
    tokens_per_minute: 350000
    concurrent_requests: 25
    requests_per_minute: 2100
    thread_count: 50
    max_retries: 50
  parallelization:
    stagger: 0.25
    num_threads: 10
  vector_store:
    type: azure_ai_search
    # PLACEHOLDER is replaced dynamically (see the note at the top of this file).
    collection_name: PLACEHOLDER
    title_column: name
    # Canonical lowercase boolean (was `True`).
    overwrite: true
    url: $AI_SEARCH_URL
    audience: $AI_SEARCH_AUDIENCE
|
|
|
|
|
2024-06-27 00:23:38 -04:00
|
|
|
# Entity extraction prompt; PLACEHOLDER is replaced dynamically.
entity_extraction:
  prompt: PLACEHOLDER
|
2024-06-26 15:45:06 -04:00
|
|
|
|
2024-06-27 00:23:38 -04:00
|
|
|
# Community report prompt; PLACEHOLDER is replaced dynamically.
community_reports:
  prompt: PLACEHOLDER
|
2024-06-26 15:45:06 -04:00
|
|
|
|
2024-06-27 00:23:38 -04:00
|
|
|
# Description summarization prompt; PLACEHOLDER is replaced dynamically.
summarize_descriptions:
  prompt: PLACEHOLDER
|
2024-06-26 15:45:06 -04:00
|
|
|
|
2024-07-01 12:31:50 -04:00
|
|
|
# Claim extraction is disabled by default in the graphrag library, so it is
# explicitly enabled here for the solution accelerator.
claim_extraction:
  # Canonical lowercase boolean (was `True`).
  enabled: true
|
|
|
|
|
2024-06-26 15:45:06 -04:00
|
|
|
# Snapshot options: the graphml snapshot is turned on.
snapshots:
  # Canonical lowercase boolean (was `True`).
  graphml: true
|