# graphrag-accelerator/backend/src/api/pipeline-settings.yaml

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# This YAML file serves as a configuration template for the graphrag indexing jobs.
# Some values are hardcoded, while others, denoted by PLACEHOLDER, will be set dynamically.
# input location for the indexing job; blob-backed (type: blob), text files
input:
  type: blob
  file_type: text
  # single-quoted so the backslash, `*` and `$` are taken literally by any
  # YAML parser; read as a regex, this selects names ending in ".txt"
  file_pattern: '.*\.txt$'
  # $NAME values are presumably expanded from the environment by the
  # consumer of this template - confirm against the loader
  storage_account_blob_url: '$STORAGE_ACCOUNT_BLOB_URL'
  # sentinel value; replaced dynamically (see the note at the top of this file)
  container_name: 'PLACEHOLDER'
  base_dir: '.'

# storage target, blob-backed; entries are placed under the "output" base_dir
# (presumably the pipeline's output artifacts - confirm against the consumer)
storage:
  type: blob
  # $NAME reference, quoted so it is always read as a string
  storage_account_blob_url: '$STORAGE_ACCOUNT_BLOB_URL'
  # sentinel value; replaced dynamically (see the note at the top of this file)
  container_name: 'PLACEHOLDER'
  base_dir: output

# reporting target, blob-backed; entries are placed under the "logs" base_dir
reporting:
  type: blob
  # $NAME reference, quoted so it is always read as a string
  storage_account_blob_url: '$STORAGE_ACCOUNT_BLOB_URL'
  # sentinel value; replaced dynamically (see the note at the top of this file)
  container_name: 'PLACEHOLDER'
  base_dir: logs

# cache target, blob-backed; entries are placed under the "cache" base_dir
cache:
  type: blob
  # $NAME reference, quoted so it is always read as a string
  storage_account_blob_url: '$STORAGE_ACCOUNT_BLOB_URL'
  # sentinel value; replaced dynamically (see the note at the top of this file)
  container_name: 'PLACEHOLDER'
  base_dir: cache

# top-level chat model settings (type: azure_openai_chat)
# $NAME values are presumably expanded from the environment by the consumer
# of this template - confirm against the loader
llm:
  type: azure_openai_chat
  api_base: '$GRAPHRAG_API_BASE'
  api_version: '$GRAPHRAG_API_VERSION'
  model: '$GRAPHRAG_LLM_MODEL'
  deployment_name: '$GRAPHRAG_LLM_DEPLOYMENT_NAME'
  cognitive_services_endpoint: '$GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT'
  # canonical lowercase boolean; parses to the same value as `True`
  model_supports_json: true
  # rate limits and concurrency; hardcoded in this template
  tokens_per_minute: 80000
  requests_per_minute: 480
  thread_count: 50
  concurrent_requests: 25

# top-level parallelization settings; the embeddings section repeats the same
# two values in its own parallelization mapping
parallelization:
  # presumably a delay (in seconds) between starting parallel tasks - confirm
  stagger: 0.25
  num_threads: 10

# top-level async mode; the embeddings section sets its own async_mode to the
# same value
async_mode: threaded

# embedding settings: an async mode, a dedicated llm profile, parallelization
# values, and a vector store definition
# $NAME values are presumably expanded from the environment by the consumer
# of this template - confirm against the loader
embeddings:
  async_mode: threaded
  # embedding model profile (type: azure_openai_embedding)
  llm:
    type: azure_openai_embedding
    api_base: '$GRAPHRAG_API_BASE'
    api_version: '$GRAPHRAG_API_VERSION'
    batch_size: 16
    model: '$GRAPHRAG_EMBEDDING_MODEL'
    deployment_name: '$GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME'
    cognitive_services_endpoint: '$GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT'
    # rate limits, concurrency and retry budget; hardcoded in this template
    tokens_per_minute: 350000
    concurrent_requests: 25
    requests_per_minute: 2100
    thread_count: 50
    max_retries: 50
  parallelization:
    stagger: 0.25
    num_threads: 10
  # vector store definition (type: azure_ai_search)
  vector_store:
    type: azure_ai_search
    # sentinel value; replaced dynamically (see the note at the top of this file)
    collection_name: 'PLACEHOLDER'
    title_column: name
    # canonical lowercase boolean; parses to the same value as `True`
    overwrite: true
    url: '$AI_SEARCH_URL'
    audience: '$AI_SEARCH_AUDIENCE'

# entity extraction prompt; the sentinel below is replaced dynamically
# (see the note at the top of this file)
entity_extraction:
  prompt: 'PLACEHOLDER'

# community reports prompt; the sentinel below is replaced dynamically
# (see the note at the top of this file)
community_reports:
  prompt: 'PLACEHOLDER'

# description summarization prompt; the sentinel below is replaced dynamically
# (see the note at the top of this file)
summarize_descriptions:
  prompt: 'PLACEHOLDER'

# claim extraction is disabled by default in the graphrag library, so it is
# enabled explicitly for the solution accelerator
claim_extraction:
  # canonical lowercase boolean; parses to the same value as `True`
  enabled: true

# snapshot switches; graphml is turned on
# (presumably emits a GraphML export of the graph - confirm against graphrag docs)
snapshots:
  # canonical lowercase boolean; parses to the same value as `True`
  graphml: true
initial commit 2024-06-26 15:45:06 -04:00			`# Copyright (c) Microsoft Corporation.`
			`# Licensed under the MIT License.`

			`# this yaml file serves as a configuration template for the graphrag indexing jobs`
code cleanup and removal of comments (#27) 2024-06-27 00:23:38 -04:00			`# some values are hardcoded while others denoted by PLACEHOLDER will be dynamically set`
initial commit 2024-06-26 15:45:06 -04:00			`input:`
			`type: blob`
			`file_type: text`
			`file_pattern: .*\.txt$`
			`storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL`
			`container_name: PLACEHOLDER`
			`base_dir: .`

			`storage:`
			`type: blob`
			`storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL`
			`container_name: PLACEHOLDER`
			`base_dir: output`

			`reporting:`
			`type: blob`
			`storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL`
			`container_name: PLACEHOLDER`
			`base_dir: logs`

			`cache:`
			`type: blob`
			`storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL`
			`container_name: PLACEHOLDER`
			`base_dir: cache`

			`llm:`
			`type: azure_openai_chat`
			`api_base: $GRAPHRAG_API_BASE`
			`api_version: $GRAPHRAG_API_VERSION`
			`model: $GRAPHRAG_LLM_MODEL`
			`deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME`
			`cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT`
			`model_supports_json: True`
			`tokens_per_minute: 80000`
			`requests_per_minute: 480`
			`thread_count: 50`
			`concurrent_requests: 25`

			`parallelization:`
			`stagger: 0.25`
			`num_threads: 10`

			`async_mode: threaded`

			`embeddings:`
			`async_mode: threaded`
			`llm:`
			`type: azure_openai_embedding`
			`api_base: $GRAPHRAG_API_BASE`
			`api_version: $GRAPHRAG_API_VERSION`
			`batch_size: 16`
			`model: $GRAPHRAG_EMBEDDING_MODEL`
			`deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME`
			`cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT`
			`tokens_per_minute: 350000`
			`concurrent_requests: 25`
			`requests_per_minute: 2100`
			`thread_count: 50`
			`max_retries: 50`
			`parallelization:`
			`stagger: 0.25`
			`num_threads: 10`
			`vector_store:`
			`type: azure_ai_search`
			`collection_name: PLACEHOLDER`
			`title_column: name`
			`overwrite: True`
			`url: $AI_SEARCH_URL`
			`audience: $AI_SEARCH_AUDIENCE`

code cleanup and removal of comments (#27) 2024-06-27 00:23:38 -04:00			`entity_extraction:`
			`prompt: PLACEHOLDER`
initial commit 2024-06-26 15:45:06 -04:00
code cleanup and removal of comments (#27) 2024-06-27 00:23:38 -04:00			`community_reports:`
			`prompt: PLACEHOLDER`
initial commit 2024-06-26 15:45:06 -04:00
code cleanup and removal of comments (#27) 2024-06-27 00:23:38 -04:00			`summarize_descriptions:`
			`prompt: PLACEHOLDER`
initial commit 2024-06-26 15:45:06 -04:00
Enable claim extraction by default (#38) 2024-07-01 12:31:50 -04:00			`# claim extraction is disabled by default in the graphrag library so we enable it for the solution accelerator`
			`claim_extraction:`
			`enabled: True`

initial commit 2024-06-26 15:45:06 -04:00			`snapshots:`
			`graphml: True`