386 lines
14 KiB
HTML
Raw Normal View History

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Prompt Tuning ⚙️</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
<link href="https://unpkg.com/prismjs@1.20.0/themes/prism-okaidia.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/Primer/19.1.1/tooltips.min.css" crossorigin="anonymous" referrerpolicy="no-referrer">
<style>
  /* ---- Page reset ---- */
  html,
  body {
    margin: 0;
    padding: 0;
  }

  body {
    font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
  }

  /* ---- Footer: a single centred row of links ---- */
  footer {
    display: flex;
    flex-direction: row;
    justify-content: center;
    align-items: center;
    gap: 18px;
    width: 100%;
    height: 32px;
    font-size: 12px;
    color: #5d5d5d;
    background: #e9eaeb;
    border-top: 1px solid #c4c5c6;
  }

  /* The "Cookies" footer entry is a script-driven control, styled like a link. */
  #cookiesManager {
    color: #485fc7;
    cursor: pointer;
  }

  /* ---- Layout: sidebar and main content side by side ---- */
  .page-content {
    display: flex;
    flex-direction: row;
    margin: 0;
    padding: 0;
    overflow: scroll;
  }

  header {
    height: 2%;
    padding: 10px;
    background-color: lightgrey;
  }

  /* Note: this page's markup contains no <nav> element; the rule is kept unchanged. */
  nav {
    min-width: 200px;
    padding: 1em;
  }

  main {
    flex: 1;
    padding: 0 5em;
  }

  .logotitle {
    margin: 5px;
    font-size: 1.5em;
    font-weight: bold;
  }

  /*
   * The highlighted code samples carry classes such as "token number" and
   * "token tag"; these names are also used by Bulma components, so their
   * styles are stripped here (presumably to keep code samples unaffected).
   */
  .number {
    all: unset;
  }

  .tag.token {
    all: unset;
  }

  /* ---- Typography inside the article ---- */
  main ul {
    margin-top: 10px;
    padding-left: 30px;
    list-style-type: disc;
  }

  h1 {
    margin-top: 10px;
    font-size: 2rem;
  }

  h2 {
    margin-top: 10px;
    font-size: 1.5rem;
    font-weight: 500;
  }

  h3 {
    margin-top: 10px;
    font-size: 1rem;
    font-weight: 500;
  }

  p {
    margin-top: 10px;
  }

  /* ---- Accessibility styling ---- */
  /* Links are underlined so they are not identified by colour alone. */
  a {
    color: #485fc7;
    text-decoration: underline;
  }

  /* Sidebar menu entries are exempt from the underline. */
  .menu-list a {
    text-decoration: none;
  }

  /* Colour overrides for syntax-highlighting token classes. */
  .token.comment,
  .token.prolog,
  .token.doctype,
  .token.cdata {
    color: #8093a5;
  }

  .token.property,
  .token.tag,
  .token.constant,
  .token.symbol,
  .token.deleted {
    color: #ff36ab;
  }
</style>
<script type="module" async>
  // Load Mermaid from the CDN and configure it.
  import mermaid from "https://unpkg.com/mermaid@10/dist/mermaid.esm.min.mjs";

  // initialize() is called directly: handing `mermaid.initialize(...)` to
  // addEventListener would run it immediately anyway and register its return
  // value (not a function) as the listener.
  // `startOnLoad` is the documented Mermaid option for rendering diagrams
  // once the page has loaded.
  mermaid.initialize({ startOnLoad: true });
</script>
<script>
  /**
   * Show a tooltip on a copy button after a clipboard attempt.
   *
   * Unless the triggering element's own class list already contains
   * "tooltipped", its first child receives the tooltip classes and the
   * message as its aria-label.
   *
   * @param {object} clipboardEvent Event object passed by the ClipboardJS
   *   "success"/"error" callbacks; only its `trigger` element is used.
   * @param {string} message Text to expose through the tooltip.
   */
  function showTooltip(clipboardEvent, message) {
    const trigger = clipboardEvent.trigger;
    if (trigger.className.includes("tooltipped")) {
      return;
    }
    const icon = trigger.children[0];
    icon.className = "tooltipped tooltipped-s";
    icon.ariaLabel = message;
  }

  // Wire every ".code-copy" button once the page (including the separately
  // loaded ClipboardJS script) has finished loading.
  window.addEventListener("load", () => {
    const clipboard = new ClipboardJS(".code-copy");
    clipboard.on("success", (event) => showTooltip(event, "Copied!"));
    clipboard.on("error", (event) => showTooltip(event, "Failed..."));
  });
</script>
<script async="" src="https://cdn.jsdelivr.net/npm/clipboard@2.0.11/dist/clipboard.min.js"></script>
<script src="https://wcpstatic.microsoft.com/mscc/lib/v2/wcp-consent.js" type="text/javascript"></script>
<script>
/**
 * Consent-change callback passed to WcpConsent.init; it only logs the value
 * it is called with.
 *
 * @param {*} categoryPreferences Value supplied by the consent library.
 */
function onConsentChanged(categoryPreferences) {
  console.log("onConsentChanged", categoryPreferences);
}
// Consent handle delivered by WcpConsent.init; remains undefined until the
// init callback reports success.
var siteConsent

/**
 * Page start-up: writes the copyright text into the #copyright footer link
 * and, when the WcpConsent library is present, initialises the cookie
 * banner inside the "cookie-banner" element.
 */
function initialize() {
  var year = new Date().getFullYear()
  document.getElementById("copyright").innerHTML = `©️ ${year} Microsoft`;

  // Nothing more to do when the consent library is not available.
  if (!window.WcpConsent) {
    return
  }

  WcpConsent.init("en-US", "cookie-banner", function (err, _siteConsent) {
    if (err) {
      console.log("Error initializing WcpConsent: "+ err);
      return
    }
    // Kept at file scope so the other helpers can query the current consent.
    siteConsent = _siteConsent;
  }, onConsentChanged, WcpConsent.themes.light);
}
// Both handlers run once the DOM is parsed. initialize is registered first,
// so it runs before checkCookieManager reads the consent state.
window.addEventListener("DOMContentLoaded", initialize)
window.addEventListener("DOMContentLoaded", checkCookieManager)
/**
 * Show the footer "Cookies" entry and its "|" divider only when the consent
 * object reports that consent is required; hide both otherwise.
 *
 * `siteConsent` is assigned only after WcpConsent.init succeeds, so it can
 * still be undefined here (library blocked, failed to load, or init
 * reported an error). That case is treated as "consent not required"
 * instead of throwing.
 */
function checkCookieManager(){
  var consentRequired = Boolean(siteConsent && siteConsent.isConsentRequired);
  var display = consentRequired ? 'block' : 'none';
  document.getElementById("cookiesManager").style.display = display;
  document.getElementById("divider").style.display = display;
}
/**
 * Click handler for the footer "Cookies" entry: opens the consent
 * preferences UI when consent is required.
 *
 * Does nothing when `siteConsent` was never assigned (consent library
 * unavailable or its initialisation failed).
 */
function manageConsent() {
  if (siteConsent && siteConsent.isConsentRequired) {
    siteConsent.manageConsent();
  }
}
</script>
</head>
<body>
<header>
  <!-- Mount point for the cookie banner; the consent script is given this id. -->
  <div id="cookie-banner"></div>
  <!-- Site title, linking back to the home page. -->
  <a href="/"><span class="logotitle">GraphRAG</span></a>
</header>
<div class="page-content">
<!-- Sidebar: site-wide menu; the entry for the current page is marked with is-active / aria-current. -->
<aside class="menu">
  <ul class="menu-list">
    <li>
      <a href="/">Welcome</a>
    </li>

    <!-- Get Started Links -->
    <li>
      <a href="/posts/get_started/">Get Started</a>
      <a href="/posts/developing/">Developing</a>
    </li>

    <!-- Indexing Links -->
    <li>
      <a href="/posts/index/overview/">Indexing</a>
      <ul>
        <li>
          <a href="/posts/index/0-architecture/">Architecture</a>
        </li>
        <li>
          <a href="/posts/index/1-default_dataflow/">Dataflow</a>
        </li>
        <li>
          <a href="/posts/index/2-cli/">CLI</a>
        </li>
        <li>
          <a href="/posts/config/overview/">Configuration</a>
          <ul>
            <li>
              <a href="/posts/config/env_vars">Using Env Vars</a>
            </li>
            <li>
              <a href="/posts/config/json_yaml">Using JSON or YAML</a>
            </li>
            <li>
              <a href="/posts/config/custom">Fully Custom</a>
            </li>
            <li>
              <a href="/posts/config/template">Template</a>
            </li>
          </ul>
        </li>
        <li>
          <a href="/posts/prompt_tuning/overview/">Prompt Tuning</a>
          <ul>
            <li>
              <a href="/posts/prompt_tuning/auto_prompt_tuning/" class="is-active" aria-current="page">Automatic Templating</a>
            </li>
            <li>
              <a href="/posts/prompt_tuning/manual_prompt_tuning/">Manual Prompt Tuning</a>
            </li>
          </ul>
        </li>
      </ul>
    </li>

    <!-- Query Links -->
    <li>
      <a href="/posts/query/overview/">Query</a>
      <ul>
        <li>
          <a href="/posts/query/1-local_search/">Local Search</a>
        </li>
        <li>
          <a href="/posts/query/2-question_generation/">Question Generation</a>
        </li>
        <li>
          <a href="/posts/query/0-global_search/">Global Search</a>
        </li>
        <li>
          <a href="/posts/query/3-cli/">CLI</a>
        </li>
        <li>
          <a href="/posts/query/notebooks/overview/">Notebooks</a>
          <ul>
            <li>
              <a href="/posts/query/notebooks/global_search_nb">Global Search</a>
            </li>
            <li>
              <a href="/posts/query/notebooks/local_search_nb">Local Search</a>
            </li>
          </ul>
        </li>
      </ul>
    </li>
  </ul>
</aside>
<!-- Main Content -->
<main>
<h1>Prompt Tuning ⚙️</h1>
<p>GraphRAG provides the ability to create domain adaptive templates for the generation of the knowledge graph. This step is optional, though it is highly encouraged to run it as it will yield better results when executing an Index Run.</p>
<p>The templates are generated by loading the inputs, splitting them into chunks (text units) and then running a series of LLM invocations and template substitutions to generate the final prompts. We suggest using the default values provided by the script, but in this page you'll find the detail of each in case you want to further explore and tweak the template generation algorithm.</p>
<h2>Usage</h2>
<p>You can run the main script from the command line with various options:</p>
<div style="position: relative">
<pre class="language-bash"><code id="code-12" class="language-bash">python <span class="token parameter variable">-m</span> graphrag.prompt_tune <span class="token punctuation">[</span>--root ROOT<span class="token punctuation">]</span> <span class="token punctuation">[</span>--domain DOMAIN<span class="token punctuation">]</span> <span class="token punctuation">[</span>--method METHOD<span class="token punctuation">]</span> <span class="token punctuation">[</span>--limit LIMIT<span class="token punctuation">]</span> <span class="token punctuation">[</span>--max-tokens MAX_TOKENS<span class="token punctuation">]</span> <span class="token punctuation">[</span>--chunk-size CHUNK_SIZE<span class="token punctuation">]</span> <span class="token punctuation">[</span>--no-entity-types<span class="token punctuation">]</span> <span class="token punctuation">[</span>--output OUTPUT<span class="token punctuation">]</span></code></pre>
<!-- Copy-to-clipboard control for #code-12. It is icon-only, so aria-label supplies its accessible name; type="button" keeps it from ever acting as a submit button; the default focus outline is left in place so keyboard users can see focus. -->
<button type="button" class="code-copy" data-clipboard-target="#code-12" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; opacity: 0.8;" title="Copy" aria-label="Copy code to clipboard">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<h2>Command-Line Options</h2>
<ul>
<li>
<p><code>--root</code> (optional): The data project root directory, including the config files (YML, JSON, or .env). Defaults to the current directory.</p>
</li>
<li>
<p><code>--domain</code> (optional): The domain related to your input data, such as 'space science', 'microbiology', or 'environmental news'. If left empty, the domain will be inferred from the input data.</p>
</li>
<li>
<p><code>--method</code> (optional): The method to select documents. Options are all, random, or top. Default is random.</p>
</li>
<li>
<p><code>--limit</code> (optional): The limit of text units to load when using random or top selection. Default is 15.</p>
</li>
<li>
<p><code>--max-tokens</code> (optional): Maximum token count for prompt generation. Default is 2000.</p>
</li>
<li>
<p><code>--chunk-size</code> (optional): The size in tokens to use for generating text units from input documents. Default is 200.</p>
</li>
<li>
<p><code>--no-entity-types</code> (optional): Use untyped entity extraction generation. We recommend using this when your data covers a lot of topics or it is highly randomized.</p>
</li>
<li>
<p><code>--output</code> (optional): The folder to save the generated prompts. Default is &quot;prompts&quot;.</p>
</li>
</ul>
<h2>Example Usage</h2>
<div style="position: relative">
<pre class="language-bash"><code id="code-61" class="language-bash">python <span class="token parameter variable">-m</span> graphrag.prompt_tune <span class="token parameter variable">--root</span> /path/to/project <span class="token parameter variable">--domain</span> <span class="token string">"environmental news"</span> <span class="token parameter variable">--method</span> random <span class="token parameter variable">--limit</span> <span class="token number">10</span> --max-tokens <span class="token number">2048</span> --chunk-size <span class="token number">256</span> --no-entity-types <span class="token parameter variable">--output</span> /path/to/output</code></pre>
<!-- Copy-to-clipboard control for #code-61. It is icon-only, so aria-label supplies its accessible name; type="button" keeps it from ever acting as a submit button; the default focus outline is left in place so keyboard users can see focus. -->
<button type="button" class="code-copy" data-clipboard-target="#code-61" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; opacity: 0.8;" title="Copy" aria-label="Copy code to clipboard">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<p>or, with minimal configuration (suggested):</p>
<div style="position: relative">
<pre class="language-bash"><code id="code-65" class="language-bash">python <span class="token parameter variable">-m</span> graphrag.prompt_tune <span class="token parameter variable">--root</span> /path/to/project --no-entity-types</code></pre>
<!-- Copy-to-clipboard control for #code-65. It is icon-only, so aria-label supplies its accessible name; type="button" keeps it from ever acting as a submit button; the default focus outline is left in place so keyboard users can see focus. -->
<button type="button" class="code-copy" data-clipboard-target="#code-65" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; opacity: 0.8;" title="Copy" aria-label="Copy code to clipboard">
<span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
</button>
</div>
<h2>Document Selection Methods</h2>
<p>The auto template feature ingests the input data and then divides it into text units the size of the chunk size parameter.
After that, it uses one of the following selection methods to pick a sample to work with for template generation:</p>
<ul>
<li><code>random</code>: Select text units randomly. This is the default and recommended option.</li>
<li><code>top</code>: Select the head n text units.</li>
<li><code>all</code>: Use all text units for the generation. Use only with small datasets; this option is not usually recommended.</li>
</ul>
<h2>Modify Env Vars</h2>
<p>After running auto-templating, you should modify the following environment variables (or config variables) to pick up the new prompts on your index run. Note: Please make sure to update the correct path to the generated prompts; in this example we are using the default &quot;prompts&quot; path.</p>
<ul>
<li>
<p><code>GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE</code> = &quot;prompts/entity_extraction.txt&quot;</p>
</li>
<li>
<p><code>GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE</code> = &quot;prompts/community_report.txt&quot;</p>
</li>
<li>
<p><code>GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE</code> = &quot;prompts/summarize_descriptions.txt&quot;</p>
</li>
</ul>
</main>
</div>
<!-- Page footer: legal links plus the script-controlled cookie-preferences entry. -->
<footer>
  <a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a>
  |
  <a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a>
  |
  <!-- A native button (not a clickable span) so the control can be reached and activated from the keyboard; the inline reset keeps the link-like look given by the #cookiesManager rule. Its visibility is toggled by checkCookieManager(). -->
  <button type="button" id="cookiesManager" style="background: none; border: 0; padding: 0; font: inherit;" onclick="manageConsent();">Cookies</button>
  <!-- Separator shown or hidden together with the Cookies entry. -->
  <span id="divider">|</span>
  <a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a>
  |
  <a href="https://www.microsoft.com/trademarks">Trademarks</a>
  |
  <!-- Text is filled in by initialize() with the current year. -->
  <a href="https://www.microsoft.com" id="copyright"></a>
  |
  <a href="https://github.com/microsoft/graphrag">GitHub</a>
</footer>
</body>
</html>