mirror of
				https://github.com/microsoft/graphrag.git
				synced 2025-10-30 17:29:50 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			367 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			367 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| <!doctype html>
 | |
| <html lang="en">
 | |
|   <head>
 | |
|     <meta charset="utf-8">
 | |
|     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 | |
|     <title>GraphRAG Indexing 🤖</title>
 | |
|     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
 | |
|     <link href="https://unpkg.com/prismjs@1.20.0/themes/prism-okaidia.css" rel="stylesheet">
 | |
|     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/Primer/19.1.1/tooltips.min.css" crossorigin="anonymous" referrerpolicy="no-referrer">
 | |
|     <style>
 | |
| html {
 | |
|     padding: 0;
 | |
|     margin: 0;
 | |
| }
 | |
| 
 | |
| body{
 | |
|     font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
 | |
|     padding: 0;
 | |
|     margin: 0;
 | |
| }
 | |
| 
 | |
| footer{
 | |
|     width: 100%;
 | |
| 	height: 32px;
 | |
| 	font-size: 12px;
 | |
| 	display: flex;
 | |
| 	flex-direction: row;
 | |
| 	justify-content: center;
 | |
| 	gap: 18px;
 | |
| 	align-items: center;
 | |
| 	color: #5d5d5d;
 | |
| 	background: #e9eaeb;
 | |
| 	border-top: 1px solid #c4c5c6;
 | |
| }
 | |
| 
 | |
| #cookiesManager{
 | |
|     cursor: pointer;
 | |
|     color: #485fc7;
 | |
| }
 | |
| 
 | |
| /* Row layout that holds the sidebar menu and the main content side by side. */
 | |
| .page-content {
 | |
|     display: flex;
 | |
|     flex-direction: row;
 | |
|     margin: 0;
 | |
|     padding: 0;
 | |
|     overflow: scroll;
 | |
| }
 | |
| 
 | |
| header {
 | |
|     background-color: lightgrey;
 | |
|     height: 2%;
 | |
|     padding: 10px;
 | |
| }
 | |
| 
 | |
| nav {
 | |
|     padding: 1em;
 | |
|     min-width: 200px;
 | |
| }
 | |
| 
 | |
| main {
 | |
|     flex: 1;
 | |
|     padding: 0 5em 0 5em;
 | |
| }
 | |
| 
 | |
| .logotitle {
 | |
|     font-size: 1.5em;
 | |
|     font-weight: bold;
 | |
|     margin: 5px;
 | |
| }
 | |
| 
 | |
| .number {
 | |
|     all: unset;
 | |
| }
 | |
| 
 | |
| .tag.token {
 | |
|     all: unset;
 | |
| }
 | |
| 
 | |
| main ul {
 | |
|     list-style-type: disc;
 | |
|     padding-left: 30px;
 | |
|     margin-top: 10px;
 | |
| }
 | |
| 
 | |
| h1 {
 | |
|     font-size: 2rem;
 | |
|     margin-top: 10px;
 | |
| }
 | |
| 
 | |
| h2 {
 | |
|     font-size: 1.5rem;
 | |
|     margin-top: 10px;
 | |
|     font-weight: 500;
 | |
| }
 | |
| 
 | |
| h3 {
 | |
|     font-size: 1rem;
 | |
|     margin-top: 10px; 
 | |
|     font-weight: 500;
 | |
| }
 | |
| p {
 | |
|     margin-top: 10px;
 | |
| }
 | |
| 
 | |
| /* Accessibility styling */
 | |
| 
 | |
| a {
 | |
|     color: #485fc7;
 | |
|     text-decoration: underline;
 | |
| }
 | |
| 
 | |
| .menu-list a {
 | |
|     text-decoration: none;
 | |
| }
 | |
| 
 | |
| 
 | |
| .token.comment, .token.prolog, .token.doctype, .token.cdata {
 | |
|     color: #8093a5;
 | |
| }
 | |
| 
 | |
| .token.property, .token.tag, .token.constant, .token.symbol, .token.deleted {
 | |
|     color: #ff36ab;
 | |
| }
 | |
| </style>
 | |
|     <script type="module" async>import mermaid from "https://unpkg.com/mermaid@10/dist/mermaid.esm.min.mjs";/* Call initialize() directly: the previous code passed its return value to addEventListener, which registered no listener. "startOnLoad" replaces the unrecognized "loadOnSave" key. */mermaid.initialize({"startOnLoad":true});</script>
 | |
|     <script>
 | |
|       // Turn the first child of the copy button that fired `event` into a tooltip
 | |
|       // labelled `message`, unless the button's own class already contains "tooltipped".
 | |
|       function showTooltip(event, message) {
 | |
|         if (event.trigger.className.includes("tooltipped")) {
 | |
|           return;
 | |
|         }
 | |
|         const icon = event.trigger.children[0];
 | |
|         icon.className = "tooltipped tooltipped-s";
 | |
|         icon.ariaLabel = message;
 | |
|       }
 | |
| 
 | |
|       // After the page has loaded, attach ClipboardJS to every .code-copy button
 | |
|       // and report the outcome of each copy attempt through showTooltip.
 | |
|       window.addEventListener("load", () => {
 | |
|         const clipboard = new ClipboardJS(".code-copy");
 | |
|         clipboard.on("success", (event) => showTooltip(event, "Copied!"));
 | |
|         clipboard.on("error", (event) => showTooltip(event, "Failed..."));
 | |
|       });
 | |
|     </script>
 | |
| <script async="" src="https://cdn.jsdelivr.net/npm/clipboard@2.0.11/dist/clipboard.min.js"></script>
 | |
| 
 | |
|     
 | |
|     <script src="https://wcpstatic.microsoft.com/mscc/lib/v2/wcp-consent.js" type="text/javascript"></script>
 | |
|     <script>
 | |
|         // Handed to WcpConsent.init in initialize(); logs the category preferences it receives.
 | |
|         function onConsentChanged(categoryPreferences) {
 | |
|             console.log("onConsentChanged", categoryPreferences);        
 | |
|         }
 | |
| 
 | |
|         var siteConsent
 | |
| 
 | |
|         function initialize(){
 | |
|           var currentYear = new Date().getFullYear()
 | |
|           document.getElementById("copyright").innerHTML = `©️ ${currentYear} Microsoft`;
 | |
|           window.WcpConsent && WcpConsent.init("en-US", "cookie-banner", function (err, _siteConsent) {
 | |
|               if (!err) {
 | |
|                   siteConsent = _siteConsent;  //siteConsent is used to get the current consent  
 | |
|               } else {
 | |
|                   console.log("Error initializing WcpConsent: "+ err);
 | |
|               }
 | |
|           }, onConsentChanged, WcpConsent.themes.light);
 | |
|         }
 | |
| 
 | |
|         addEventListener("DOMContentLoaded", initialize)
 | |
|         addEventListener("DOMContentLoaded", checkCookieManager)
 | |
| 
 | |
|         function checkCookieManager(){
 | |
|           if(siteConsent.isConsentRequired){
 | |
|             document.getElementById("cookiesManager").style.display = 'block';
 | |
|             document.getElementById("divider").style.display = 'block';
 | |
|           }
 | |
|           else{
 | |
|             document.getElementById("cookiesManager").style.display = 'none';
 | |
|             document.getElementById("divider").style.display = 'none';
 | |
|           }
 | |
|         }
 | |
| 
 | |
|         function manageConsent() {
 | |
|         if(siteConsent.isConsentRequired){
 | |
|             siteConsent.manageConsent();
 | |
|         }
 | |
|     }
 | |
|     </script>
 | |
|     
 | |
|   </head>
 | |
|   <body>
 | |
|     <header>
 | |
|         <div id="cookie-banner"></div>
 | |
|         <a href="/graphrag/"><span class="logotitle">GraphRAG</span></a>
 | |
|     </header>
 | |
|     <div class="page-content">
 | |
|         <!-- Sidebar -->
 | |
|         <aside class="menu">
 | |
|           <ul class="menu-list">
 | |
|             <li>
 | |
|               
 | |
| <a href="/graphrag/">Welcome</a>
 | |
| 
 | |
|             </li>
 | |
| 
 | |
|             <!-- Get Started Links -->
 | |
|             <li>
 | |
|               
 | |
| <a href="/graphrag/posts/get_started/">Get Started</a>
 | |
| 
 | |
|               
 | |
| <a href="/graphrag/posts/developing/">Developing</a>
 | |
| 
 | |
|             </li>
 | |
| 
 | |
|             <!-- Indexing Links -->
 | |
|             <li>
 | |
|                 
 | |
| <a href="/graphrag/posts/index/overview/" class="is-active" aria-current="page">Indexing</a>
 | |
| 
 | |
|                 <ul><li>
 | |
| <a href="/graphrag/posts/index/0-architecture/">Architecture</a>
 | |
| </li><li>
 | |
| <a href="/graphrag/posts/index/1-default_dataflow/">Dataflow</a>
 | |
| </li><li>
 | |
| <a href="/graphrag/posts/index/2-cli/">CLI</a>
 | |
| </li><li>
 | |
| <a href="/graphrag/posts/index/3-prompt_tuning/">Prompt Tuning</a>
 | |
| </li><li>
 | |
|                     
 | |
| <a href="/graphrag/posts/config/overview/">Configuration</a>
 | |
| 
 | |
|                     <ul>
 | |
|                       <li>
 | |
| <a href="/graphrag/posts/config/env_vars">Using Env Vars</a>
 | |
| </li>
 | |
|                       <li>
 | |
| <a href="/graphrag/posts/config/json_yaml">Using JSON or YAML</a>
 | |
| </li>
 | |
|                       <li>
 | |
| <a href="/graphrag/posts/config/custom">Fully Custom</a>
 | |
| </li>
 | |
|                       <li>
 | |
| <a href="/graphrag/posts/config/template">Template</a>
 | |
| </li>
 | |
|                     </ul>
 | |
|                   </li>
 | |
|                 </ul>
 | |
|             </li>
 | |
|             
 | |
| 
 | |
|             <!-- Query Links -->
 | |
|             <li>
 | |
|               
 | |
| <a href="/graphrag/posts/query/overview/">Query</a>
 | |
| 
 | |
|               <ul><li>
 | |
| <a href="/graphrag/posts/query/0-global_search/">Global Search</a>
 | |
| </li><li>
 | |
| <a href="/graphrag/posts/query/1-local_search/">Local Search</a>
 | |
| </li><li>
 | |
| <a href="/graphrag/posts/query/2-question_generation/">Question Generation</a>
 | |
| </li><li>
 | |
| <a href="/graphrag/posts/query/3-cli/">CLI</a>
 | |
| </li><li>
 | |
|                   
 | |
| <a href="/graphrag/posts/query/notebooks/overview/">Notebooks</a>
 | |
| 
 | |
|                   <ul>
 | |
|                     <li>
 | |
| <a href="/graphrag/posts/query/notebooks/global_search_nb">Global Search</a>
 | |
| </li>
 | |
|                     <li>
 | |
| <a href="/graphrag/posts/query/notebooks/local_search_nb">Local Search</a>
 | |
| </li>
 | |
|                   </ul>
 | |
|                 </li>
 | |
|             </ul>
 | |
|             </li>
 | |
|           </ul>
 | |
|         </aside>
 | |
| 
 | |
|         <!-- Main Content -->
 | |
|         <main>
 | |
|             <h1>GraphRAG Indexing 🤖</h1>
 | |
|             <p>The GraphRAG indexing package is a data pipeline and transformation suite that is designed to extract meaningful, structured data from unstructured text using LLMs.</p>
 | |
| <p>Indexing Pipelines are configurable. They are composed of workflows, standard and custom steps, prompt templates, and input/output adapters. Our standard pipeline is designed to:</p>
 | |
| <ul>
 | |
| <li>extract entities, relationships and claims from raw text</li>
 | |
| <li>perform community detection in entities</li>
 | |
| <li>generate community summaries and reports at multiple levels of granularity</li>
 | |
| <li>embed entities into a graph vector space</li>
 | |
| <li>embed text chunks into a textual vector space</li>
 | |
| </ul>
 | |
| <p>The outputs of the pipeline can be stored in a variety of formats, including JSON and Parquet - or they can be handled manually via the Python API.</p>
 | |
| <h2>Getting Started</h2>
 | |
| <h3>Requirements</h3>
 | |
| <p>See the <a href="/graphrag/posts/developing#requirements">requirements</a> section in <a href="/graphrag/posts/get_started">Get Started</a> for details on setting up a development environment.</p>
 | |
| <p>The Indexing Engine can be used in either a default configuration mode or with a custom pipeline.
 | |
| To configure GraphRAG, see the <a href="/graphrag/posts/config/overview">configuration</a> documentation.
 | |
| After you have a config file you can run the pipeline using the CLI or the Python API.</p>
 | |
| <h2>Usage</h2>
 | |
| <h3>CLI</h3>
 | |
| 
 | |
| <div style="position: relative">
 | |
|   <pre class="language-bash"><code id="code-54" class="language-bash"><span class="token comment"># Via Poetry</span>
 | |
| poetry run poe cli <span class="token parameter variable">--root</span> <span class="token operator">&lt;</span>data_root<span class="token operator">&gt;</span> <span class="token comment"># default config mode</span>
 | |
| poetry run poe cli <span class="token parameter variable">--config</span> your_pipeline.yml <span class="token comment"># custom config mode</span>
 | |
| 
 | |
| <span class="token comment"># Via Node</span>
 | |
| <span class="token function">yarn</span> run:index <span class="token parameter variable">--root</span> <span class="token operator">&lt;</span>data_root<span class="token operator">&gt;</span> <span class="token comment"># default config mode</span>
 | |
| <span class="token function">yarn</span> run:index <span class="token parameter variable">--config</span> your_pipeline.yml <span class="token comment"># custom config mode</span>
 | |
| </code></pre>
 | |
| 
 | |
|   <button class="code-copy " data-clipboard-target="#code-54" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
 | |
|     <span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
 | |
|   </button>
 | |
| </div>
 | |
| <h3>Python API</h3>
 | |
| 
 | |
| <div style="position: relative">
 | |
|   <pre class="language-python"><code id="code-58" class="language-python"><span class="token keyword">import</span> pandas <span class="token keyword">as</span> pd
 | |
| <span class="token keyword">from</span> graphrag<span class="token punctuation">.</span>index <span class="token keyword">import</span> run_pipeline
 | |
| <span class="token keyword">from</span> graphrag<span class="token punctuation">.</span>index<span class="token punctuation">.</span>config <span class="token keyword">import</span> PipelineWorkflowReference
 | |
| 
 | |
| workflows<span class="token punctuation">:</span> <span class="token builtin">list</span><span class="token punctuation">[</span>PipelineWorkflowReference<span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token punctuation">[</span>
 | |
|     PipelineWorkflowReference<span class="token punctuation">(</span>
 | |
|         steps<span class="token operator">=</span><span class="token punctuation">[</span>
 | |
|             <span class="token punctuation">{</span>
 | |
|                 <span class="token comment"># built-in verb</span>
 | |
|                 <span class="token string">"verb"</span><span class="token punctuation">:</span> <span class="token string">"derive"</span><span class="token punctuation">,</span>  <span class="token comment"># https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/engine/verbs/derive.py</span>
 | |
|                 <span class="token string">"args"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span>
 | |
|                     <span class="token string">"column1"</span><span class="token punctuation">:</span> <span class="token string">"col1"</span><span class="token punctuation">,</span>  <span class="token comment"># from above</span>
 | |
|                     <span class="token string">"column2"</span><span class="token punctuation">:</span> <span class="token string">"col2"</span><span class="token punctuation">,</span>  <span class="token comment"># from above</span>
 | |
|                     <span class="token string">"to"</span><span class="token punctuation">:</span> <span class="token string">"col_multiplied"</span><span class="token punctuation">,</span>  <span class="token comment"># new column name</span>
 | |
|                     <span class="token string">"operator"</span><span class="token punctuation">:</span> <span class="token string">"*"</span><span class="token punctuation">,</span>  <span class="token comment"># multiply the two columns</span>
 | |
|                 <span class="token punctuation">}</span><span class="token punctuation">,</span>
 | |
|                 <span class="token comment"># Since we're trying to act on the default input, we don't need explicitly to specify an input</span>
 | |
|             <span class="token punctuation">}</span>
 | |
|         <span class="token punctuation">]</span>
 | |
|     <span class="token punctuation">)</span><span class="token punctuation">,</span>
 | |
| <span class="token punctuation">]</span>
 | |
| 
 | |
| dataset <span class="token operator">=</span> pd<span class="token punctuation">.</span>DataFrame<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token punctuation">{</span><span class="token string">"col1"</span><span class="token punctuation">:</span> <span class="token number">2</span><span class="token punctuation">,</span> <span class="token string">"col2"</span><span class="token punctuation">:</span> <span class="token number">4</span><span class="token punctuation">}</span><span class="token punctuation">,</span> <span class="token punctuation">{</span><span class="token string">"col1"</span><span class="token punctuation">:</span> <span class="token number">5</span><span class="token punctuation">,</span> <span class="token string">"col2"</span><span class="token punctuation">:</span> <span class="token number">10</span><span class="token punctuation">}</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
 | |
| outputs <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
 | |
| <span class="token keyword">async</span> <span class="token keyword">for</span> output <span class="token keyword">in</span> <span class="token keyword">await</span> run_pipeline<span class="token punctuation">(</span>dataset<span class="token operator">=</span>dataset<span class="token punctuation">,</span> workflows<span class="token operator">=</span>workflows<span class="token punctuation">)</span><span class="token punctuation">:</span>
 | |
|     outputs<span class="token punctuation">.</span>append<span class="token punctuation">(</span>output<span class="token punctuation">)</span>
 | |
| pipeline_result <span class="token operator">=</span> outputs<span class="token punctuation">[</span><span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">]</span>
 | |
| <span class="token keyword">print</span><span class="token punctuation">(</span>pipeline_result<span class="token punctuation">)</span></code></pre>
 | |
| 
 | |
|   <button class="code-copy " data-clipboard-target="#code-58" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; outline: none; opacity: 0.8;" title="Copy">
 | |
|     <span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;" class=""></span>
 | |
|   </button>
 | |
| </div>
 | |
| <h2>Further Reading</h2>
 | |
| <ul>
 | |
| <li>To start developing within the <em>GraphRAG</em> project, see <a href="/graphrag/posts/developing/">getting started</a></li>
 | |
| <li>To understand the underlying concepts and execution model of the indexing library, see <a href="/graphrag/posts/index/0-architecture/">the architecture documentation</a></li>
 | |
| <li>To get running with a series of examples, see <a href="https://github.com/microsoft/graphrag/blob/main/examples/README.md">the examples documentation</a></li>
 | |
| <li>To read more about configuring the indexing engine, see <a href="/graphrag/posts/config/overview">the configuration documentation</a></li>
 | |
| </ul>
 | |
| 
 | |
|         </main>
 | |
|     </div>
 | |
|     <footer>
 | |
|       <a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a>
 | |
|       |
 | |
|       <a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a>
 | |
|       |
 | |
|       <!-- Exposed as a button and made keyboard-operable (Enter/Space); it was previously click-only. -->
 | |
|       <span id="cookiesManager" role="button" tabindex="0" onclick="manageConsent();" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); manageConsent(); }">Cookies</span>
 | |
|       <span id="divider">|</span>
 | |
|       <a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a>
 | |
|       |
 | |
|       <a href="https://www.microsoft.com/trademarks">Trademarks</a>
 | |
|       |
 | |
|       <a href="https://www.microsoft.com" id="copyright"></a>
 | |
|       |
 | |
|       <a href="https://github.com/microsoft/graphrag">GitHub</a>
 | |
|     </footer>    
 | |
|   </body>
 | |
| </html> | 
