<!-- 492 lines | 24 KiB | HTML -->

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Custom Configuration Mode</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
<link href="https://unpkg.com/prismjs@1.20.0/themes/prism-okaidia.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/Primer/19.1.1/tooltips.min.css" crossorigin="anonymous" referrerpolicy="no-referrer">
<style>
/* Remove default spacing on the root element. */
html {
padding: 0;
margin: 0;
}
/* Base font stack for the page; no default spacing on the body. */
body{
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
padding: 0;
margin: 0;
}
/* Thin full-width footer bar: items are laid out in a centered horizontal flex row. */
footer{
width: 100%;
height: 32px;
font-size: 12px;
display: flex;
flex-direction: row;
justify-content: center;
gap: 18px;
align-items: center;
color: #5d5d5d;
background: #e9eaeb;
border-top: 1px solid #c4c5c6;
}
/* "Cookies" footer entry: a clickable span (not an anchor), given the same color as links. */
#cookiesManager{
cursor: pointer;
color: #485fc7;
}
/* Flex row that places the sidebar menu and the main content side by side. */
.page-content {
display: flex;
flex-direction: row;
margin: 0;
padding: 0;
overflow: scroll;
}
/* Top bar that holds the site title link. */
header {
background-color: lightgrey;
height: 2%;
padding: 10px;
}
/* NOTE(review): this page's markup has no <nav> element (the sidebar is an <aside class="menu">), so this rule may be unused here; confirm. */
nav {
padding: 1em;
min-width: 200px;
}
/* Main column grows to fill the remaining width of the .page-content flex row. */
main {
flex: 1;
padding: 0 5em 0 5em;
}
/* Site title text in the header. */
.logotitle {
font-size: 1.5em;
font-weight: bold;
margin: 5px;
}
/* NOTE(review): presumably neutralizes framework styles for .number / .tag that collide with syntax-highlighting token class names; confirm. */
.number {
all: unset;
}
.tag.token {
all: unset;
}
/* Bulleted, indented lists inside the main content. */
main ul {
list-style-type: disc;
padding-left: 30px;
margin-top: 10px;
}
/* Heading and paragraph sizing and spacing. */
h1 {
font-size: 2rem;
margin-top: 10px;
}
h2 {
font-size: 1.5rem;
margin-top: 10px;
font-weight: 500;
}
h3 {
font-size: 1rem;
margin-top: 10px;
font-weight: 500;
}
p {
margin-top: 10px;
}
/* Accessibility styling */
/* Links are underlined so they are not distinguished by color alone. */
a {
color: #485fc7;
text-decoration: underline;
}
/* Sidebar menu links opt out of the underline. */
.menu-list a {
text-decoration: none;
}
/* Color overrides for syntax-highlighting tokens. */
.token.comment, .token.prolog, .token.doctype, .token.cdata {
color: #8093a5;
}
.token.property, .token.tag, .token.constant, .token.symbol, .token.deleted {
color: #ff36ab;
}
</style>
<script type="module" async>
  import mermaid from "https://unpkg.com/mermaid@10/dist/mermaid.esm.min.mjs";

  // Configure Mermaid as soon as the module has been evaluated.
  // initialize() must be called directly: handing its return value to
  // addEventListener("DOMContentLoaded", ...) would run it immediately anyway
  // and register no listener at all.
  // NOTE(review): "loadOnSave" is kept verbatim; it does not look like a
  // documented Mermaid option ("startOnLoad" may have been intended). Confirm.
  mermaid.initialize({"loadOnSave":true});
</script>
<script>
  /**
   * Marks the first child of the element that triggered a clipboard event as a
   * tooltip and sets the tooltip text through its aria-label.
   * Does nothing when the trigger element's own class name already contains
   * "tooltipped".
   *
   * @param {object} clipboardEvent - Event object whose `trigger` is the clicked element.
   * @param {string} message - Text to expose as the tooltip label.
   */
  function showTooltip(clipboardEvent, message) {
    if (clipboardEvent.trigger.className.includes("tooltipped")) {
      return;
    }
    const tooltipTarget = clipboardEvent.trigger.children[0];
    tooltipTarget.className = "tooltipped tooltipped-s";
    tooltipTarget.ariaLabel = message;
  }

  // Wire every ".code-copy" button to ClipboardJS once the page has fully loaded.
  window.addEventListener("load", () => {
    const clipboard = new ClipboardJS(".code-copy");
    clipboard.on("success", (clipboardEvent) => showTooltip(clipboardEvent, "Copied!"));
    clipboard.on("error", (clipboardEvent) => showTooltip(clipboardEvent, "Failed..."));
  });
</script>
<script async="" src="https://cdn.jsdelivr.net/npm/clipboard@2.0.11/dist/clipboard.min.js"></script>
<script src="https://wcpstatic.microsoft.com/mscc/lib/v2/wcp-consent.js" type="text/javascript"></script>
<script>
/**
 * Consent-change callback passed to WcpConsent.init; it only logs the
 * category preferences it is given.
 *
 * @param {*} preferences - Category preferences supplied by the consent library.
 */
function onConsentChanged(preferences) {
  console.log("onConsentChanged", preferences);
}
// Consent handle delivered by WcpConsent.init; remains undefined until that
// callback reports success.
var siteConsent;

/**
 * Writes the copyright text (with the current year) into the #copyright
 * element and, when the WcpConsent library is available on window,
 * initializes it with the "cookie-banner" target, the onConsentChanged
 * callback and the light theme.
 */
function initialize() {
  var copyrightElement = document.getElementById("copyright");
  copyrightElement.innerHTML = `©️ ${new Date().getFullYear()} Microsoft`;

  // Nothing more to do when the consent script is not present.
  if (!window.WcpConsent) {
    return;
  }

  WcpConsent.init(
    "en-US",
    "cookie-banner",
    function (initError, consentHandle) {
      if (initError) {
        console.log("Error initializing WcpConsent: "+ initError);
        return;
      }
      // Keep the handle so later code can query and manage the current consent.
      siteConsent = consentHandle;
    },
    onConsentChanged,
    WcpConsent.themes.light
  );
}
addEventListener("DOMContentLoaded", initialize)
addEventListener("DOMContentLoaded", checkCookieManager)

/**
 * Shows the footer "Cookies" entry and its divider when consent is required,
 * and hides both otherwise.
 *
 * siteConsent is assigned only inside the WcpConsent.init success callback, so
 * it can still be undefined here (consent script missing, init error, or the
 * callback not having run yet). That case is treated as "consent not required"
 * instead of throwing a TypeError.
 */
function checkCookieManager(){
  var consentRequired = Boolean(siteConsent && siteConsent.isConsentRequired);
  var displayValue = consentRequired ? 'block' : 'none';
  document.getElementById("cookiesManager").style.display = displayValue;
  document.getElementById("divider").style.display = displayValue;
}
/**
 * Opens the consent management UI when consent is required.
 *
 * Does nothing when siteConsent has not been set (for example, the consent
 * library never initialized), so activating the "Cookies" entry cannot throw.
 */
function manageConsent() {
  if (siteConsent && siteConsent.isConsentRequired) {
    siteConsent.manageConsent();
  }
}
</script>
</head>
<body>
<header>
<div id="cookie-banner"></div>
<a href="/"><span class="logotitle">GraphRAG</span></a>
</header>
<div class="page-content">
<!-- Sidebar -->
<aside class="menu">
<ul class="menu-list">
<li>
<a href="/">Welcome</a>
</li>
<!-- Get Started Links -->
<li>
<a href="/posts/get_started/">Get Started</a>
<a href="/posts/developing/">Developing</a>
</li>
<!-- Indexing Links -->
<li>
<a href="/posts/index/overview/">Indexing</a>
<ul><li>
<a href="/posts/index/0-architecture/">Architecture</a>
</li><li>
<a href="/posts/index/1-default_dataflow/">Dataflow</a>
</li><li>
<a href="/posts/index/2-cli/">CLI</a>
</li><li>
<a href="/posts/config/overview/">Configuration</a>
<ul>
<li>
<a href="/posts/config/init">Init command</a>
</li>
<li>
<a href="/posts/config/env_vars">Using Env Vars</a>
</li>
<li>
<a href="/posts/config/json_yaml">Using JSON or YAML</a>
</li>
<li>
<a href="/posts/config/custom">Fully Custom</a>
</li>
<li>
<a href="/posts/config/template">Template</a>
</li>
</ul>
</li>
<li>
<a href="/posts/prompt_tuning/overview/">Prompt Tuning</a>
<ul>
<li>
<a href="/posts/prompt_tuning/auto_prompt_tuning/">Automatic Templating</a>
</li>
<li>
<a href="/posts/prompt_tuning/manual_prompt_tuning/">Manual Prompt Tuning</a>
</li>
</ul>
</li>
</ul>
</li>
<!-- Query Links -->
<li>
<a href="/posts/query/overview/">Query</a>
<ul><li>
<a href="/posts/query/1-local_search/">Local Search</a>
</li><li>
<a href="/posts/query/2-question_generation/">Question Generation</a>
</li><li>
<a href="/posts/query/0-global_search/">Global Search</a>
</li><li>
<a href="/posts/query/3-cli/">CLI</a>
</li><li>
<a href="/posts/query/notebooks/overview/">Notebooks</a>
<ul>
<li>
<a href="/posts/query/notebooks/global_search_nb">Global Search</a>
</li>
<li>
<a href="/posts/query/notebooks/local_search_nb">Local Search</a>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</aside>
<!-- Main Content -->
<main>
<h1>Custom Configuration Mode</h1>
<p>The primary configuration sections for Indexing Engine pipelines are described below. Each configuration section can be expressed in Python (for use in Python API mode) as well as YAML, but YAML is shown here for brevity.</p>
<p>Using custom configuration is an advanced use-case. Most users will want to use the <a href="/posts/config/overview">Default Configuration</a> instead.</p>
<h2>Indexing Engine Examples</h2>
<p>The <a href="https://github.com/microsoft/graphrag/blob/main/examples/">examples</a> directory contains several examples of how to use the indexing engine with <em>custom configuration</em>.</p>
<p>Most examples include two different forms of running the pipeline, both of which are contained in the example's <code>run.py</code>:</p>
<ol>
<li>Using mostly the Python API</li>
<li>Using mostly a pipeline configuration file</li>
</ol>
<p>To run an example:</p>
<ul>
<li>Run <code>poetry shell</code> to activate a virtual environment with the required dependencies.</li>
<li>Run <code>PYTHONPATH=&quot;$(pwd)&quot; python examples/path_to_example/run.py</code> from the <code>root</code> directory.</li>
</ul>
<p>For example, to run the single_verb example, you would run the following commands:</p>
<div style="position: relative">
<pre class="language-bash"><code id="code-45" class="language-bash">poetry shell</code></pre>
<button class="code-copy " data-clipboard-target="#code-45" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<div style="position: relative">
<pre class="language-sh"><code id="code-46" class="language-sh"><span class="token assign-left variable">PYTHONPATH</span><span class="token operator">=</span><span class="token string">"<span class="token variable"><span class="token variable">$(</span><span class="token builtin class-name">pwd</span><span class="token variable">)</span></span>"</span> python examples/single_verb/run.py</code></pre>
<button class="code-copy " data-clipboard-target="#code-46" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<h1>Configuration Sections</h1>
<h1>&gt; extends</h1>
<p>This configuration allows you to extend a base configuration file or files.</p>
<div style="position: relative">
<pre class="language-yaml"><code id="code-56" class="language-yaml"><span class="token comment"># single base</span>
<span class="token key atrule">extends</span><span class="token punctuation">:</span> ../base_config.yml</code></pre>
<button class="code-copy " data-clipboard-target="#code-56" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<div style="position: relative">
<pre class="language-yaml"><code id="code-57" class="language-yaml"><span class="token comment"># multiple bases</span>
<span class="token key atrule">extends</span><span class="token punctuation">:</span>
<span class="token punctuation">-</span> ../base_config.yml
<span class="token punctuation">-</span> ../base_config2.yml</code></pre>
<button class="code-copy " data-clipboard-target="#code-57" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<h1>&gt; root_dir</h1>
<p>This configuration allows you to set the root directory for the pipeline. All data inputs and outputs are assumed to be relative to this path.</p>
<div style="position: relative">
<pre class="language-yaml"><code id="code-64" class="language-yaml"><span class="token key atrule">root_dir</span><span class="token punctuation">:</span> /workspace/data_project</code></pre>
<button class="code-copy " data-clipboard-target="#code-64" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<h1>&gt; storage</h1>
<p>This configuration allows you to define the output strategy for the pipeline.</p>
<ul>
<li><code>type</code>: The type of storage to use. Options are <code>file</code>, <code>memory</code>, and <code>blob</code></li>
<li><code>base_dir</code> (<code>type: file</code> only): The base directory to store the data in. This is relative to the config root.</li>
<li><code>connection_string</code> (<code>type: blob</code> only): The connection string to use for blob storage.</li>
<li><code>container_name</code> (<code>type: blob</code> only): The container to use for blob storage.</li>
</ul>
<h1>&gt; cache</h1>
<p>This configuration allows you to define the cache strategy for the pipeline.</p>
<ul>
<li><code>type</code>: The type of cache to use. Options are <code>file</code>, <code>memory</code>, and <code>blob</code>.</li>
<li><code>base_dir</code> (<code>type: file</code> only): The base directory to store the cache in. This is relative to the config root.</li>
<li><code>connection_string</code> (<code>type: blob</code> only): The connection string to use for blob storage.</li>
<li><code>container_name</code> (<code>type: blob</code> only): The container to use for blob storage.</li>
</ul>
<h1>&gt; reporting</h1>
<p>This configuration allows you to define the reporting strategy for the pipeline. Report files are generated artifacts that summarize the performance metrics of the pipeline and emit any error messages.</p>
<ul>
<li><code>type</code>: The type of reporting to use. Options are <code>file</code>, <code>memory</code>, and <code>blob</code></li>
<li><code>base_dir</code> (<code>type: file</code> only): The base directory to store the reports in. This is relative to the config root.</li>
<li><code>connection_string</code> (<code>type: blob</code> only): The connection string to use for blob storage.</li>
<li><code>container_name</code> (<code>type: blob</code> only): The container to use for blob storage.</li>
</ul>
<h1>&gt; workflows</h1>
<p>This configuration section defines the workflow DAG for the pipeline. Here we define an array of workflows and express their inter-dependencies in steps:</p>
<ul>
<li><code>name</code>: The name of the workflow. This is used to reference the workflow in other parts of the config.</li>
<li><code>steps</code>: The DataShaper steps that this workflow comprises. If a step defines an input in the form of <code>workflow:&lt;workflow_name&gt;</code>, then it is assumed to have a dependency on the output of that workflow.</li>
</ul>
<div style="position: relative">
<pre class="language-yaml"><code id="code-167" class="language-yaml"><span class="token key atrule">workflows</span><span class="token punctuation">:</span>
<span class="token punctuation">-</span> <span class="token key atrule">name</span><span class="token punctuation">:</span> workflow1
<span class="token key atrule">steps</span><span class="token punctuation">:</span>
<span class="token punctuation">-</span> <span class="token key atrule">verb</span><span class="token punctuation">:</span> derive
<span class="token key atrule">args</span><span class="token punctuation">:</span>
<span class="token key atrule">column1</span><span class="token punctuation">:</span> <span class="token string">"col1"</span>
<span class="token key atrule">column2</span><span class="token punctuation">:</span> <span class="token string">"col2"</span>
<span class="token punctuation">-</span> <span class="token key atrule">name</span><span class="token punctuation">:</span> workflow2
<span class="token key atrule">steps</span><span class="token punctuation">:</span>
<span class="token punctuation">-</span> <span class="token key atrule">verb</span><span class="token punctuation">:</span> derive
<span class="token key atrule">args</span><span class="token punctuation">:</span>
<span class="token key atrule">column1</span><span class="token punctuation">:</span> <span class="token string">"col1"</span>
<span class="token key atrule">column2</span><span class="token punctuation">:</span> <span class="token string">"col2"</span>
<span class="token key atrule">input</span><span class="token punctuation">:</span>
<span class="token comment"># dependency established here</span>
<span class="token key atrule">source</span><span class="token punctuation">:</span> workflow<span class="token punctuation">:</span>workflow1</code></pre>
<button class="code-copy " data-clipboard-target="#code-167" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<h1>&gt; input</h1>
<ul>
<li><code>type</code>: The type of input to use. Options are <code>file</code> or <code>blob</code>.</li>
<li><code>file_type</code>: The file type field discriminates between the different input types. Options are <code>csv</code> and <code>text</code>.</li>
<li><code>base_dir</code>: The base directory to read the input files from. This is relative to the config file.</li>
<li><code>file_pattern</code>: A regex to match the input files. The regex must have named groups for each of the fields in the file_filter.</li>
<li><code>post_process</code>: A DataShaper workflow definition to apply to the input before executing the primary workflow.</li>
<li><code>source_column</code> (<code>file_type: csv</code> only): The column containing the source/author of the data</li>
<li><code>text_column</code> (<code>file_type: csv</code> only): The column containing the text of the data</li>
<li><code>timestamp_column</code> (<code>file_type: csv</code> only): The column containing the timestamp of the data</li>
<li><code>timestamp_format</code> (<code>file_type: csv</code> only): The format of the timestamp</li>
</ul>
<div style="position: relative">
<pre class="language-yaml"><code id="code-218" class="language-yaml"><span class="token key atrule">input</span><span class="token punctuation">:</span>
<span class="token key atrule">type</span><span class="token punctuation">:</span> file
<span class="token key atrule">file_type</span><span class="token punctuation">:</span> csv
<span class="token key atrule">base_dir</span><span class="token punctuation">:</span> ../data/csv <span class="token comment"># the directory containing the CSV files, this is relative to the config file</span>
<span class="token key atrule">file_pattern</span><span class="token punctuation">:</span> <span class="token string">'.*[\/](?P&lt;source>[^\/]+)[\/](?P&lt;year>\d{4})-(?P&lt;month>\d{2})-(?P&lt;day>\d{2})_(?P&lt;author>[^_]+)_\d+\.csv$'</span> <span class="token comment"># a regex to match the CSV files</span>
<span class="token comment"># An additional file filter which uses the named groups from the file_pattern to further filter the files</span>
<span class="token comment"># file_filter:</span>
<span class="token comment"># # source: (source_filter)</span>
<span class="token comment"># year: (2023)</span>
<span class="token comment"># month: (06)</span>
<span class="token comment"># # day: (22)</span>
<span class="token key atrule">source_column</span><span class="token punctuation">:</span> <span class="token string">"author"</span> <span class="token comment"># the column containing the source/author of the data</span>
<span class="token key atrule">text_column</span><span class="token punctuation">:</span> <span class="token string">"message"</span> <span class="token comment"># the column containing the text of the data</span>
<span class="token key atrule">timestamp_column</span><span class="token punctuation">:</span> <span class="token string">"date(yyyyMMddHHmmss)"</span> <span class="token comment"># optional, the column containing the timestamp of the data</span>
<span class="token key atrule">timestamp_format</span><span class="token punctuation">:</span> <span class="token string">"%Y%m%d%H%M%S"</span> <span class="token comment"># optional, the format of the timestamp</span>
<span class="token key atrule">post_process</span><span class="token punctuation">:</span> <span class="token comment"># Optional, set of steps to process the data before going into the workflow</span>
<span class="token punctuation">-</span> <span class="token key atrule">verb</span><span class="token punctuation">:</span> filter
<span class="token key atrule">args</span><span class="token punctuation">:</span>
<span class="token key atrule">column</span><span class="token punctuation">:</span> <span class="token string">"title"</span>
<span class="token key atrule">value</span><span class="token punctuation">:</span> <span class="token string">"My document"</span></code></pre>
<button class="code-copy " data-clipboard-target="#code-218" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<div style="position: relative">
<pre class="language-yaml"><code id="code-219" class="language-yaml"><span class="token key atrule">input</span><span class="token punctuation">:</span>
<span class="token key atrule">type</span><span class="token punctuation">:</span> file
<span class="token key atrule">file_type</span><span class="token punctuation">:</span> csv
<span class="token key atrule">base_dir</span><span class="token punctuation">:</span> ../data/csv <span class="token comment"># the directory containing the CSV files, this is relative to the config file</span>
<span class="token key atrule">file_pattern</span><span class="token punctuation">:</span> <span class="token string">'.*[\/](?P&lt;source>[^\/]+)[\/](?P&lt;year>\d{4})-(?P&lt;month>\d{2})-(?P&lt;day>\d{2})_(?P&lt;author>[^_]+)_\d+\.csv$'</span> <span class="token comment"># a regex to match the CSV files</span>
<span class="token comment"># An additional file filter which uses the named groups from the file_pattern to further filter the files</span>
<span class="token comment"># file_filter:</span>
<span class="token comment"># # source: (source_filter)</span>
<span class="token comment"># year: (2023)</span>
<span class="token comment"># month: (06)</span>
<span class="token comment"># # day: (22)</span>
<span class="token key atrule">post_process</span><span class="token punctuation">:</span> <span class="token comment"># Optional, set of steps to process the data before going into the workflow</span>
<span class="token punctuation">-</span> <span class="token key atrule">verb</span><span class="token punctuation">:</span> filter
<span class="token key atrule">args</span><span class="token punctuation">:</span>
<span class="token key atrule">column</span><span class="token punctuation">:</span> <span class="token string">"title"</span>
<span class="token key atrule">value</span><span class="token punctuation">:</span> <span class="token string">"My document"</span></code></pre>
<button class="code-copy " data-clipboard-target="#code-219" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
</main>
</div>
<footer>
<a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a>
|
<a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a>
|
<!-- Clickable span acting as a button: role, tabindex and the key handler make it reachable and operable from the keyboard and announced as a button. -->
<span id="cookiesManager" role="button" tabindex="0" onclick="manageConsent();" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); manageConsent(); }">Cookies</span>
<span id="divider">|</span>
<a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a>
|
<a href="https://www.microsoft.com/trademarks">Trademarks</a>
|
<a href="https://www.microsoft.com" id="copyright"></a>
|
<a href="https://github.com/microsoft/graphrag">GitHub</a>
|
<a href="https://github.com/Azure-Samples/graphrag-accelerator">Solution Accelerator</a>
</footer>
</body>
</html>