<!doctype html>
<!-- The doctype must be the very first thing in the file: any text before it
     puts browsers into quirks mode. -->
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GraphRAG Indexing 🤖</title>
<!-- Third-party stylesheets: Bulma (layout/menu), Prism okaidia theme (code
     highlighting) and Primer tooltips (copy-button feedback). -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
<link rel="stylesheet" href="https://unpkg.com/prismjs@1.20.0/themes/prism-okaidia.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/Primer/19.1.1/tooltips.min.css" crossorigin="anonymous" referrerpolicy="no-referrer">
<style>
  /* ---- Page chrome ------------------------------------------------------ */
  html {
    padding: 0;
    margin: 0;
  }

  body {
    font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
    padding: 0;
    margin: 0;
  }

  /* Footer: a single centred row of legal links. */
  footer {
    width: 100%;
    height: 32px;
    font-size: 12px;
    display: flex;
    flex-direction: row;
    justify-content: center;
    gap: 18px;
    align-items: center;
    color: #5d5d5d;
    background: #e9eaeb;
    border-top: 1px solid #c4c5c6;
  }

  /* The "Cookies" footer control is not an anchor, so it is given the link
     colour and a pointer cursor explicitly. */
  #cookiesManager {
    cursor: pointer;
    color: #485fc7;
  }

  /* Two-column row: sidebar menu on the left, main content on the right. */
  .page-content {
    display: flex;
    flex-direction: row;
    margin: 0;
    padding: 0;
    overflow: scroll;
  }

  header {
    background-color: lightgrey;
    height: 2%;
    padding: 10px;
  }

  /* NOTE(review): no <nav> element is visible in this page's markup, so this
     rule may currently match nothing - confirm against the page template. */
  nav {
    padding: 1em;
    min-width: 200px;
  }

  /* Main content takes all remaining width beside the sidebar. */
  main {
    flex: 1;
    padding: 0 5em 0 5em;
  }

  .logotitle {
    font-size: 1.5em;
    font-weight: bold;
    margin: 5px;
  }

  /* ---- Framework overrides ---------------------------------------------- */
  /* Presumably these undo Bulma's .number / .tag component styles, which
     would otherwise hit Prism's "token number" / "token tag" spans - TODO
     confirm. */
  .number {
    all: unset;
  }

  .tag.token {
    all: unset;
  }

  /* ---- Content typography ----------------------------------------------- */
  main ul {
    list-style-type: disc;
    padding-left: 30px;
    margin-top: 10px;
  }

  h1 {
    font-size: 2rem;
    margin-top: 10px;
  }

  h2 {
    font-size: 1.5rem;
    margin-top: 10px;
    font-weight: 500;
  }

  h3 {
    font-size: 1rem;
    margin-top: 10px;
    font-weight: 500;
  }

  p {
    margin-top: 10px;
  }

  /* Accessibility styling */
  a {
    color: #485fc7;
    text-decoration: underline;
  }

  /* Sidebar menu links are not underlined. */
  .menu-list a {
    text-decoration: none;
  }

  /* Colour overrides for two groups of Prism tokens. */
  .token.comment, .token.prolog, .token.doctype, .token.cdata {
    color: #8093a5;
  }

  .token.property, .token.tag, .token.constant, .token.symbol, .token.deleted {
    color: #ff36ab;
  }
</style>
2024-04-04 15:56:27 +00:00
<script type="module" async>
  // Load Mermaid as an ES module and configure it.
  //
  // initialize() is called directly instead of being passed to
  // addEventListener: the previous code invoked it immediately and registered
  // its return value as the DOMContentLoaded listener, so no listener ever ran.
  // "startOnLoad" is the documented option that makes Mermaid render diagrams
  // once the page has loaded.
  import mermaid from "https://unpkg.com/mermaid@10/dist/mermaid.esm.min.mjs";

  mermaid.initialize({ startOnLoad: true });
</script>
2024-04-04 01:23:24 +00:00
<script>
  /**
   * Show a tooltip on the copy button that fired a ClipboardJS event.
   *
   * The tooltip classes and text go on the button's first child element.
   * The text is written with setAttribute("aria-label", ...) so it is always
   * present as a real attribute, which the tooltip stylesheet reads.
   *
   * @param {{trigger: Element}} event - ClipboardJS "success" or "error" event.
   * @param {string} message - Text to display in the tooltip.
   */
  function showTooltip(event, message) {
    var icon = event.trigger.children[0];
    if (!icon) {
      return; // Button has no child element to attach the tooltip to.
    }
    icon.className = "tooltipped tooltipped-s";
    icon.setAttribute("aria-label", message);
  }

  // Wire up the copy buttons after the page, including the async ClipboardJS
  // script below, has finished loading.
  window.addEventListener("load", () => {
    if (typeof ClipboardJS === "undefined") {
      return; // Library failed to load; copy buttons stay inert.
    }
    var clipboard = new ClipboardJS(".code-copy");
    clipboard.on("success", (event) => showTooltip(event, "Copied!"));
    clipboard.on("error", (event) => showTooltip(event, "Failed..."));
  });
</script>
<script async src="https://cdn.jsdelivr.net/npm/clipboard@2.0.11/dist/clipboard.min.js"></script>
2024-04-04 01:23:24 +00:00
< script src = "https://wcpstatic.microsoft.com/mscc/lib/v2/wcp-consent.js" type = "text/javascript" > < / script >
< script >
/**
 * Consent-change callback passed to WcpConsent.init.
 * It only logs the preferences object it is given.
 *
 * @param {*} categoryPreferences - Value supplied by the consent library.
 */
function onConsentChanged(categoryPreferences) {
  console.log(
    "onConsentChanged",
    categoryPreferences
  );
}
var siteConsent
/**
 * Page start-up: fill in the footer copyright text and, when the WcpConsent
 * library is present, initialise the cookie banner.
 *
 * On successful initialisation the consent object is stored in the
 * script-level `siteConsent` variable so manageConsent() can use it later.
 */
function initialize() {
  var currentYear = new Date().getFullYear();

  // textContent rather than innerHTML: the value is plain text. The null check
  // keeps a missing footer element from aborting consent initialisation.
  var copyrightLink = document.getElementById("copyright");
  if (copyrightLink) {
    copyrightLink.textContent = `©️ ${currentYear} Microsoft`;
  }

  // The consent library is loaded from an external host and may be missing.
  if (!window.WcpConsent) {
    return;
  }

  WcpConsent.init("en-US", "cookie-banner", function (err, _siteConsent) {
    if (!err) {
      siteConsent = _siteConsent; // siteConsent is used to get the current consent
    } else {
      console.log("Error initializing WcpConsent: "+ err);
    }
  }, onConsentChanged, WcpConsent.themes.light);
}
addEventListener("DOMContentLoaded", initialize)
/**
 * Click handler for the footer "Cookies" control: opens the consent
 * preferences dialog when consent is required.
 *
 * `siteConsent` stays undefined if the consent library never loaded or its
 * initialisation failed, so it is checked before being dereferenced.
 */
function manageConsent() {
  if (siteConsent && siteConsent.isConsentRequired) {
    siteConsent.manageConsent();
  }
}
< / script >
< / head >
< body >
<header>
  <!-- Mount point for the cookie banner; its id is passed to WcpConsent.init. -->
  <div id="cookie-banner"></div>
  <!-- Site title, linking back to the documentation home page. -->
  <a href="/graphrag/"><span class="logotitle">GraphRAG</span></a>
</header>
< div class = "page-content" >
<!-- Sidebar -->
<!-- Documentation sidebar. The current page's link carries "is-active" and
     aria-current="page"; sub-menus marked "hidden" are collapsed. -->
<aside class="menu" aria-label="Documentation">
  <ul class="menu-list">
    <li>
      <a href="/graphrag/">Welcome</a>
    </li>
    <!-- Get Started Links -->
    <li>
      <a href="/graphrag/posts/get_started/">Get Started</a>
    </li>
    <li>
      <a href="/graphrag/posts/developing/">Developing</a>
    </li>
    <!-- Indexing Links -->
    <li>
      <a href="/graphrag/posts/index/overview/" class="is-active" aria-current="page">Indexing</a>
      <ul>
        <li>
          <a href="/graphrag/posts/index/0-architecture/">Architecture</a>
        </li>
        <li>
          <a href="/graphrag/posts/index/1-default_dataflow/">Dataflow</a>
        </li>
        <li>
          <a href="/graphrag/posts/index/2-cli/">CLI</a>
        </li>
        <li>
          <a href="/graphrag/posts/index/workflows/overview/">Workflows</a>
          <ul hidden>
            <li>
              <a href="/graphrag/posts/index/workflows/create_base_documents/">create_base_documents</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_base_entity_graph/">create_base_entity_graph</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_base_extracted_entities/">create_base_extracted_entities</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_base_text_units/">create_base_text_units</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_communities/">create_final_communities</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_community_reports/">create_final_community_reports</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_covariates/">create_final_covariates</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_documents/">create_final_documents</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_entities/">create_final_entities</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_nodes/">create_final_nodes</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_relationships/">create_final_relationships</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_final_text_units/">create_final_text_units</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/workflows/create_summarized_entities/">create_summarized_entities</a>
            </li>
          </ul>
        </li>
        <li>
          <a href="/graphrag/posts/index/verbs/overview/">Verbs</a>
          <ul hidden>
            <li>
              <a href="/graphrag/posts/index/verbs/aggregate/">aggregate</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/chunk/">chunk</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/cluster_graph/">cluster_graph</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/concat/">concat</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/create_graph/">create_graph</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/genid/">genid</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/layout_graph/">layout_graph</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/merge/">merge</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/merge_graphs/">merge_graphs</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/spread_json/">spread_json</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/text_replace/">text_replace</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/text_split/">text_split</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/unpack_graph/">unpack_graph</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/unzip/">unzip</a>
            </li>
            <li>
              <a href="/graphrag/posts/index/verbs/zip/">zip</a>
            </li>
          </ul>
        </li>
        <li>
          <a href="/graphrag/posts/config/overview/">Configuration</a>
          <ul>
            <li>
              <a href="/graphrag/posts/config/env_vars">Using Env Vars</a>
            </li>
            <li>
              <a href="/graphrag/posts/config/json_yaml">Using JSON or YAML</a>
            </li>
            <li>
              <a href="/graphrag/posts/config/custom">Fully Custom</a>
            </li>
          </ul>
        </li>
      </ul>
    </li>
    <!-- Query Links -->
    <li>
      <a href="/graphrag/posts/query/overview/">Query</a>
      <ul>
        <li>
          <a href="/graphrag/posts/query/0-global_search/">Global Search</a>
        </li>
        <li>
          <a href="/graphrag/posts/query/1-local_search/">Local Search</a>
        </li>
        <li>
          <a href="/graphrag/posts/query/2-question_generation/">Question Generation</a>
        </li>
        <li>
          <a href="/graphrag/posts/query/3-cli/">CLI</a>
        </li>
        <li>
          <a href="/graphrag/posts/query/notebooks/overview/">Notebooks</a>
          <ul>
            <li>
              <a href="/graphrag/posts/query/notebooks/global_search_nb">Global Search</a>
            </li>
            <li>
              <a href="/graphrag/posts/query/notebooks/local_search_nb">Local Search</a>
            </li>
          </ul>
        </li>
      </ul>
    </li>
  </ul>
</aside>
<!-- Main Content -->
<main>
  <h1>GraphRAG Indexing 🤖</h1>
  <p>The GraphRAG indexing package is a data pipeline and transformation suite that is designed to extract meaningful, structured data from unstructured text using LLMs.</p>
  <p>Indexing Pipelines are configurable. They are composed of workflows, standard and custom steps, prompt templates, and input/output adapters. Our standard pipeline is designed to:</p>
  <ul>
    <li>extract entities, relationships and claims from raw text</li>
    <li>perform community detection in entities</li>
    <li>generate community summaries and reports at multiple levels of granularity</li>
    <li>embed entities into a graph vector space</li>
    <li>embed text chunks into a textual vector space</li>
  </ul>
  <p>The outputs of the pipeline can be stored in a variety of formats, including JSON and Parquet - or they can be handled manually via the Python API.</p>
  <h2>Getting Started</h2>
  <h3>Requirements</h3>
  <p>See the <a href="/graphrag/posts/developing#requirements">requirements</a> section in <a href="/graphrag/posts/get_started">Get Started</a> for details on setting up a development environment.</p>
  <p>The Indexing Engine can be used in either a default configuration mode or with a custom pipeline.
  To configure GraphRAG, see the <a href="/graphrag/posts/config/overview">configuration</a> documentation.
  After you have a config file you can run the pipeline using the CLI or the Python API.</p>
  <h2>Usage</h2>
  <h3>CLI</h3>
  <!-- Each sample is wrapped in a relatively positioned div so its copy button
       can sit in the top-right corner. Text inside <pre> is whitespace
       sensitive and therefore kept flush left. -->
  <div style="position: relative">
<pre class="language-bash"><code id="code-54" class="language-bash"><span class="token comment"># Via Poetry</span>
poetry run poe cli <span class="token parameter variable">--root</span> <span class="token operator">&lt;</span>data_root<span class="token operator">&gt;</span> <span class="token comment"># default config mode</span>
poetry run poe cli <span class="token parameter variable">--config</span> your_pipeline.yml <span class="token comment"># custom config mode</span>
<span class="token comment"># Via Node</span>
<span class="token function">yarn</span> run:index <span class="token parameter variable">--root</span> <span class="token operator">&lt;</span>data_root<span class="token operator">&gt;</span> <span class="token comment"># default config mode</span>
<span class="token function">yarn</span> run:index <span class="token parameter variable">--config</span> your_pipeline.yml <span class="token comment"># custom config mode</span>
</code></pre>
    <!-- Icon-only button: aria-label supplies the accessible name. The first
         child span receives the tooltip classes from showTooltip(). -->
    <button type="button" class="code-copy" data-clipboard-target="#code-54" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; opacity: 0.8;" title="Copy" aria-label="Copy">
      <span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;"></span>
    </button>
  </div>
  <h3>Python API</h3>
  <div style="position: relative">
<pre class="language-python"><code id="code-58" class="language-python"><span class="token keyword">import</span> pandas <span class="token keyword">as</span> pd
<span class="token keyword">from</span> graphrag<span class="token punctuation">.</span>index <span class="token keyword">import</span> run_pipeline
<span class="token keyword">from</span> graphrag<span class="token punctuation">.</span>index<span class="token punctuation">.</span>config <span class="token keyword">import</span> PipelineWorkflowReference
workflows<span class="token punctuation">:</span> <span class="token builtin">list</span><span class="token punctuation">[</span>PipelineWorkflowReference<span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token punctuation">[</span>
    PipelineWorkflowReference<span class="token punctuation">(</span>
        steps<span class="token operator">=</span><span class="token punctuation">[</span>
            <span class="token punctuation">{</span>
                <span class="token comment"># built-in verb</span>
                <span class="token string">"verb"</span><span class="token punctuation">:</span> <span class="token string">"derive"</span><span class="token punctuation">,</span>  <span class="token comment"># https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/engine/verbs/derive.py</span>
                <span class="token string">"args"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span>
                    <span class="token string">"column1"</span><span class="token punctuation">:</span> <span class="token string">"col1"</span><span class="token punctuation">,</span>  <span class="token comment"># from above</span>
                    <span class="token string">"column2"</span><span class="token punctuation">:</span> <span class="token string">"col2"</span><span class="token punctuation">,</span>  <span class="token comment"># from above</span>
                    <span class="token string">"to"</span><span class="token punctuation">:</span> <span class="token string">"col_multiplied"</span><span class="token punctuation">,</span>  <span class="token comment"># new column name</span>
                    <span class="token string">"operator"</span><span class="token punctuation">:</span> <span class="token string">"*"</span><span class="token punctuation">,</span>  <span class="token comment"># multiply the two columns</span>
                <span class="token punctuation">}</span><span class="token punctuation">,</span>
                <span class="token comment"># Since we're trying to act on the default input, we don't need to explicitly specify an input</span>
            <span class="token punctuation">}</span>
        <span class="token punctuation">]</span>
    <span class="token punctuation">)</span><span class="token punctuation">,</span>
<span class="token punctuation">]</span>
dataset <span class="token operator">=</span> pd<span class="token punctuation">.</span>DataFrame<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token punctuation">{</span><span class="token string">"col1"</span><span class="token punctuation">:</span> <span class="token number">2</span><span class="token punctuation">,</span> <span class="token string">"col2"</span><span class="token punctuation">:</span> <span class="token number">4</span><span class="token punctuation">}</span><span class="token punctuation">,</span> <span class="token punctuation">{</span><span class="token string">"col1"</span><span class="token punctuation">:</span> <span class="token number">5</span><span class="token punctuation">,</span> <span class="token string">"col2"</span><span class="token punctuation">:</span> <span class="token number">10</span><span class="token punctuation">}</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
outputs <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
<span class="token keyword">async</span> <span class="token keyword">for</span> output <span class="token keyword">in</span> <span class="token keyword">await</span> run_pipeline<span class="token punctuation">(</span>dataset<span class="token operator">=</span>dataset<span class="token punctuation">,</span> workflows<span class="token operator">=</span>workflows<span class="token punctuation">)</span><span class="token punctuation">:</span>
    outputs<span class="token punctuation">.</span>append<span class="token punctuation">(</span>output<span class="token punctuation">)</span>
pipeline_result <span class="token operator">=</span> outputs<span class="token punctuation">[</span><span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">]</span>
<span class="token keyword">print</span><span class="token punctuation">(</span>pipeline_result<span class="token punctuation">)</span></code></pre>
    <button type="button" class="code-copy" data-clipboard-target="#code-58" style="position: absolute; top: 7.5px; right: 6px; padding-top: 3px; cursor: pointer; opacity: 0.8;" title="Copy" aria-label="Copy">
      <span style="display:inline-block;background:url(https://api.iconify.design/mdi/content-copy.svg) no-repeat center center / contain;width: 16px; height: 16px;"></span>
    </button>
  </div>
  <h2>Further Reading</h2>
  <ul>
    <li>To start developing within the <em>GraphRAG</em> project, see <a href="/graphrag/posts/developing/">getting started</a></li>
    <li>To understand the underlying concepts and execution model of the indexing library, see <a href="/graphrag/posts/index/0-architecture/">the architecture documentation</a></li>
    <li>To get running with a series of examples, see <a href="https://github.com/microsoft/graphrag/blob/main/examples/README.md">the examples documentation</a></li>
    <li>To read more about configuring the indexing engine, see <a href="/graphrag/posts/config/overview">the configuration documentation</a></li>
  </ul>
</main>
< / div >
<footer>
  <a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a>
  |
  <a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a>
  |
  <!-- Opens the cookie-consent dialog. role/tabindex and the key handler make
       the control reachable and operable from the keyboard (Enter or Space),
       matching what a click does. -->
  <span id="cookiesManager" role="button" tabindex="0" onclick="manageConsent();" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); manageConsent(); }">Cookies</span>
  |
  <a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a>
  |
  <a href="https://www.microsoft.com/trademarks">Trademarks</a>
  |
  <!-- Link text is filled in at runtime by initialize() via the "copyright" id. -->
  <a href="https://www.microsoft.com" id="copyright"></a>
  |
  <a href="https://github.com/microsoft/graphrag">GitHub</a>
</footer>
< / body >
< / html >