graphrag-accelerator/backend/src/api/pipeline-settings.yaml

92 lines
2.2 KiB
YAML
Raw Normal View History

2024-06-26 15:45:06 -04:00
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# This YAML file serves as a configuration template for the graphrag indexing jobs.
# Some values are hardcoded, while others (denoted by PLACEHOLDER) will be set dynamically.
2024-06-26 15:45:06 -04:00
# Source documents for the indexing job, read from blob storage.
input:
  type: blob
  file_type: text
  # Regular expression: anything ending in ".txt". Single-quoted so the
  # backslash stays literal.
  file_pattern: '.*\.txt$'
  # NOTE(review): presumably resolved from the environment variable of the
  # same name when the config is loaded - confirm.
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is not a real container name; it is overwritten dynamically.
  container_name: PLACEHOLDER
  base_dir: .
# Blob storage target with base directory "output".
storage:
  type: blob
  # NOTE(review): presumably resolved from the environment variable of the
  # same name when the config is loaded - confirm.
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is overwritten dynamically.
  container_name: PLACEHOLDER
  base_dir: output
# Blob storage target for reporting, with base directory "logs".
reporting:
  type: blob
  # NOTE(review): presumably resolved from the environment variable of the
  # same name when the config is loaded - confirm.
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is overwritten dynamically.
  container_name: PLACEHOLDER
  base_dir: logs
# Blob storage target for the cache, with base directory "cache".
cache:
  type: blob
  # NOTE(review): presumably resolved from the environment variable of the
  # same name when the config is loaded - confirm.
  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
  # PLACEHOLDER is overwritten dynamically.
  container_name: PLACEHOLDER
  base_dir: cache
# Chat-completion model settings (Azure OpenAI).
llm:
  type: azure_openai_chat
  # The $GRAPHRAG_* values are not literals.
  # NOTE(review): presumably resolved from environment variables of the same
  # names when the config is loaded - confirm.
  api_base: $GRAPHRAG_API_BASE
  api_version: $GRAPHRAG_API_VERSION
  model: $GRAPHRAG_LLM_MODEL
  deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME
  cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
  model_supports_json: true
  # Hardcoded rate limits and concurrency.
  # NOTE(review): presumably these should stay within the quota of the Azure
  # OpenAI deployment in use - confirm before changing.
  tokens_per_minute: 80000
  requests_per_minute: 480
  thread_count: 50
  concurrent_requests: 25
# Top-level parallelization settings. The embeddings section carries its own
# parallelization mapping with the same values.
parallelization:
  # NOTE(review): presumably a delay in seconds between task starts - confirm.
  stagger: 0.25
  num_threads: 10
# NOTE(review): presumably the pipeline-wide async mode; the embeddings section
# sets async_mode to the same value again - confirm whether both are required.
async_mode: threaded
# Text-embedding settings: the embedding model, its own parallelization, and
# the vector store.
embeddings:
  async_mode: threaded
  # Embedding model settings (Azure OpenAI).
  llm:
    type: azure_openai_embedding
    # The $GRAPHRAG_* values are not literals.
    # NOTE(review): presumably resolved from environment variables of the
    # same names when the config is loaded - confirm.
    api_base: $GRAPHRAG_API_BASE
    api_version: $GRAPHRAG_API_VERSION
    batch_size: 16
    model: $GRAPHRAG_EMBEDDING_MODEL
    deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
    cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
    # Hardcoded rate limits, concurrency and retry budget.
    # NOTE(review): presumably these should stay within the quota of the
    # embedding deployment in use - confirm before changing.
    tokens_per_minute: 350000
    concurrent_requests: 25
    requests_per_minute: 2100
    thread_count: 50
    max_retries: 50
  parallelization:
    stagger: 0.25
    num_threads: 10
  # Vector store settings (Azure AI Search).
  vector_store:
    type: azure_ai_search
    # PLACEHOLDER is overwritten dynamically.
    collection_name: PLACEHOLDER
    title_column: name
    overwrite: true
    url: $AI_SEARCH_URL
    audience: $AI_SEARCH_AUDIENCE
# Entity extraction settings; the PLACEHOLDER prompt is set dynamically.
entity_extraction:
  prompt: PLACEHOLDER
2024-06-26 15:45:06 -04:00
# Community report settings; the PLACEHOLDER prompt is set dynamically.
community_reports:
  prompt: PLACEHOLDER
2024-06-26 15:45:06 -04:00
# Description summarization settings; the PLACEHOLDER prompt is set dynamically.
summarize_descriptions:
  prompt: PLACEHOLDER
2024-06-26 15:45:06 -04:00
# Claim extraction is disabled by default in the graphrag library, so it is
# enabled explicitly for the solution accelerator.
claim_extraction:
  enabled: true
2024-06-26 15:45:06 -04:00
# Snapshot settings.
# NOTE(review): graphml presumably turns on a GraphML export of the graph -
# confirm against the graphrag configuration reference.
snapshots:
  graphml: true