### This config file contains the required core defaults, along with a handful of common optional settings.
### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/

### LLM settings ###
## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.

models:
  default_chat_model:
    type: openai_chat # or azure_openai_chat
    # api_base: https://<instance>.openai.azure.com
    # api_version: 2024-05-01-preview
    auth_type: api_key # or azure_managed_identity
    api_key: ${GRAPHRAG_API_KEY} # set this in the generated .env file
    # audience: "https://cognitiveservices.azure.com/.default"
    # organization: <organization_id>
    model: gpt-4-turbo-preview
    # deployment_name: <azure_model_deployment_name>
    # encoding_model: cl100k_base # automatically set by tiktoken if left undefined
    model_supports_json: true # recommended if this is available for your model.
    concurrent_requests: 25 # max number of simultaneous LLM requests allowed
    async_mode: threaded # or asyncio
    retry_strategy: native
    max_retries: 10
    tokens_per_minute: auto # set to null to disable rate limiting
    requests_per_minute: auto # set to null to disable rate limiting
  default_embedding_model:
    type: openai_embedding # or azure_openai_embedding
    # api_base: https://<instance>.openai.azure.com
    # api_version: 2024-05-01-preview
    auth_type: api_key # or azure_managed_identity
    api_key: ${GRAPHRAG_API_KEY}
    # audience: "https://cognitiveservices.azure.com/.default"
    # organization: <organization_id>
    model: text-embedding-3-small
    # deployment_name: <azure_model_deployment_name>
    # encoding_model: cl100k_base # automatically set by tiktoken if left undefined
    model_supports_json: true # recommended if this is available for your model.
    concurrent_requests: 25 # max number of simultaneous LLM requests allowed
    async_mode: threaded # or asyncio
    retry_strategy: native
    max_retries: 10
    tokens_per_minute: auto # set to null to disable rate limiting
    requests_per_minute: auto # set to null to disable rate limiting
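
## A minimal sketch of the companion .env file referenced above (created by
## `graphrag init`); the variable name matches the ${GRAPHRAG_API_KEY}
## interpolation in the models section, and the value shown is a placeholder:
##
##   GRAPHRAG_API_KEY=<your-api-key>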

### Input settings ###

input:
  type: file # or blob
  file_type: text # [csv, text, json]
  base_dir: "input"

chunks:
  size: 1200
  overlap: 100
  group_by_columns: [id]

### Output/storage settings ###
## If blob storage is specified in the following four sections,
## connection_string and container_name must be provided

output:
  type: file # [file, blob, cosmosdb]
  base_dir: "output"

cache:
  type: file # [file, blob, cosmosdb]
  base_dir: "cache"

reporting:
  type: file # [file, blob, cosmosdb]
  base_dir: "logs"

vector_store:
  default_vector_store:
    type: lancedb
    db_uri: output/lancedb
    container_name: default
    overwrite: True

### Workflow settings ###

embed_text:
  model_id: default_embedding_model
  vector_store_id: default_vector_store

extract_graph:
  model_id: default_chat_model
  prompt: "prompts/extract_graph.txt"
  entity_types: [organization, person, geo, event]
  max_gleanings: 1

summarize_descriptions:
  model_id: default_chat_model
  prompt: "prompts/summarize_descriptions.txt"
  max_length: 500

extract_graph_nlp:
  text_analyzer:
    extractor_type: regex_english # [regex_english, syntactic_parser, cfg]

cluster_graph:
  max_cluster_size: 10

extract_claims:
  enabled: false
  model_id: default_chat_model
  prompt: "prompts/extract_claims.txt"
  description: "Any claims or facts that could be relevant to information discovery."
  max_gleanings: 1

community_reports:
  model_id: default_chat_model
  graph_prompt: "prompts/community_report_graph.txt"
  text_prompt: "prompts/community_report_text.txt"
  max_length: 2000
  max_input_length: 8000

embed_graph:
  enabled: false # if true, will generate node2vec embeddings for nodes

umap:
  enabled: false # if true, will generate UMAP embeddings for nodes (embed_graph must also be enabled)

snapshots:
  graphml: false
  embeddings: false

### Query settings ###
## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query

local_search:
  chat_model_id: default_chat_model
  embedding_model_id: default_embedding_model
  prompt: "prompts/local_search_system_prompt.txt"

global_search:
  chat_model_id: default_chat_model
  map_prompt: "prompts/global_search_map_system_prompt.txt"
  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"

drift_search:
  chat_model_id: default_chat_model
  embedding_model_id: default_embedding_model
  prompt: "prompts/drift_search_system_prompt.txt"
  reduce_prompt: "prompts/drift_search_reduce_prompt.txt"

basic_search:
  chat_model_id: default_chat_model
  embedding_model_id: default_embedding_model
  prompt: "prompts/basic_search_system_prompt.txt"
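
## Example usage (a sketch, assuming the standard GraphRAG CLI and that this
## file sits at the project root next to the input/ directory; the query
## string below is illustrative only):
##
##   graphrag index --root .
##   graphrag query --root . --method local --query "Who is mentioned most often?"
##
## The --method flag selects one of the search sections configured above:
## local, global, drift, or basic.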