graphrag/index.html

381 lines
14 KiB
HTML
Raw Normal View History

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Welcome to GraphRAG</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
<link href="https://unpkg.com/prismjs@1.20.0/themes/prism-okaidia.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/Primer/19.1.1/tooltips.min.css" crossorigin="anonymous" referrerpolicy="no-referrer">
<style>
html {
padding: 0;
margin: 0;
}
body{
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
padding: 0;
margin: 0;
}
footer{
width: 100%;
height: 32px;
font-size: 12px;
display: flex;
flex-direction: row;
justify-content: center;
gap: 18px;
align-items: center;
color: #5d5d5d;
background: #e9eaeb;
border-top: 1px solid #c4c5c6;
}
#cookiesManager{
cursor: pointer;
color: #485fc7;
}
/* Row layout for the wrapper that holds the sidebar menu and the main content. */
.page-content {
  display: flex;
  flex-direction: row;
  margin: 0;
  padding: 0;
  overflow: scroll;
}
header {
background-color: lightgrey;
height: 2%;
padding: 10px;
}
nav {
padding: 1em;
min-width: 200px;
}
main {
flex: 1;
padding: 0 5em 0 5em;
}
.logotitle {
font-size: 1.5em;
font-weight: bold;
margin: 5px;
}
.number {
all: unset;
}
.tag.token {
all: unset;
}
main ul {
list-style-type: disc;
padding-left: 30px;
margin-top: 10px;
}
h1 {
font-size: 2rem;
margin-top: 10px;
}
h2 {
font-size: 1.5rem;
margin-top: 10px;
font-weight: 500;
}
h3 {
font-size: 1rem;
margin-top: 10px;
font-weight: 500;
}
p {
margin-top: 10px;
}
/* Accessibility styling */
a {
color: #485fc7;
text-decoration: underline;
}
.menu-list a {
text-decoration: none;
}
.token.comment, .token.prolog, .token.doctype, .token.cdata {
color: #8093a5;
}
.token.property, .token.tag, .token.constant, .token.symbol, .token.deleted {
color: #ff36ab;
}
</style>
<script type="module" async>
// Load Mermaid as an ES module from the CDN and configure it once the module
// has been evaluated.
import mermaid from "https://unpkg.com/mermaid@10/dist/mermaid.esm.min.mjs";

// Call initialize() directly instead of via a DOMContentLoaded listener:
// writing addEventListener('DOMContentLoaded', mermaid.initialize(...)) runs
// initialize() right away and registers its return value (not a function) as
// the listener, so the listener never did anything. A direct call has the same
// effect and also works when this async module finishes loading after
// DOMContentLoaded has already fired.
// NOTE(review): "loadOnSave" does not appear among Mermaid's documented
// configuration keys; "startOnLoad" may have been intended. Confirm before
// changing it.
mermaid.initialize({"loadOnSave":true});
</script>
<script>
/**
 * Mark the first child of a copy trigger as a tooltip and set its label.
 *
 * @param {object} o - Clipboard event; `o.trigger` is the element that was activated.
 * @param {string} e - Text to expose as the tooltip label.
 */
function showTooltip(o, e) {
  // Skip triggers that already carry a tooltip class themselves.
  if (o.trigger.className.includes("tooltipped")) {
    return;
  }
  var target = o.trigger.children[0];
  // A trigger without a child element has nothing to decorate.
  if (!target) {
    return;
  }
  target.className = "tooltipped tooltipped-s";
  // setAttribute is used rather than the ariaLabel property because older
  // browsers do not reflect that property onto the aria-label attribute.
  target.setAttribute("aria-label", e);
}

// Wire up copy-to-clipboard once the page (including the ClipboardJS script)
// has finished loading.
window.addEventListener("load", () => {
  // The library comes from a CDN; if it failed to load, leave the page as is
  // instead of throwing a ReferenceError.
  if (typeof ClipboardJS === "undefined") {
    return;
  }
  var clipboard = new ClipboardJS(".code-copy");
  clipboard.on("success", (event) => showTooltip(event, "Copied!"));
  clipboard.on("error", (event) => showTooltip(event, "Failed..."));
});
</script>
<script async="" src="https://cdn.jsdelivr.net/npm/clipboard@2.0.11/dist/clipboard.min.js"></script>
<script src="https://wcpstatic.microsoft.com/mscc/lib/v2/wcp-consent.js" type="text/javascript"></script>
<script>
/**
 * Consent-change callback handed to WcpConsent.init(); it only logs the
 * preferences it receives.
 *
 * @param {*} categoryPreferences - Value supplied by the consent library
 *   (presumably the per-category consent state; confirm against its docs).
 */
function onConsentChanged(categoryPreferences) {
  console.log("onConsentChanged", categoryPreferences);
}
var siteConsent
/**
 * Page start-up hook (registered for DOMContentLoaded).
 *
 * Writes the copyright text into the #copyright footer link and, when the
 * WcpConsent library is available, initialises the cookie banner inside the
 * #cookie-banner element. On success the consent object is stored in the
 * script-level `siteConsent` variable.
 */
function initialize(){
  var currentYear = new Date().getFullYear()
  var copyrightLink = document.getElementById("copyright");
  // Guard the lookup so a missing footer link cannot abort consent set-up.
  if (copyrightLink) {
    // The value is plain text, so textContent avoids needless HTML parsing.
    copyrightLink.textContent = `©️ ${currentYear} Microsoft`;
  }

  // wcp-consent.js is an external script; do nothing if it did not load.
  if (!window.WcpConsent) {
    return;
  }

  WcpConsent.init("en-US", "cookie-banner", function (err, _siteConsent) {
    if (err) {
      console.log("Error initializing WcpConsent: "+ err);
      return;
    }
    siteConsent = _siteConsent; // siteConsent is used to get the current consent
  }, onConsentChanged, WcpConsent.themes.light);
}
addEventListener("DOMContentLoaded", initialize)
/**
 * Click handler for the footer "Cookies" control.
 *
 * Calls siteConsent.manageConsent() when consent is required. `siteConsent`
 * stays undefined until WcpConsent.init() has succeeded (and forever if the
 * library failed to load or reported an error), so it is checked first to
 * avoid a TypeError on click.
 */
function manageConsent() {
  if (siteConsent && siteConsent.isConsentRequired) {
    siteConsent.manageConsent();
  }
}
</script>
</head>
<body>
<header>
<div id="cookie-banner"></div>
<a href="/graphrag/"><span class="logotitle">GraphRAG</span></a>
</header>
<div class="page-content">
<!-- Sidebar -->
<aside class="menu">
<ul class="menu-list">
<li>
<a href="/graphrag/" class="is-active" aria-current="page">Welcome</a>
</li>
<!-- Get Started Links -->
<li>
<a href="/graphrag/posts/get_started/">Get Started</a>
<a href="/graphrag/posts/developing/">Developing</a>
</li>
<!-- Indexing Links -->
<li>
<a href="/graphrag/posts/index/overview/">Indexing</a>
<ul><li>
<a href="/graphrag/posts/index/0-architecture/">Architecture</a>
</li><li>
<a href="/graphrag/posts/index/1-default_dataflow/">Dataflow</a>
</li><li>
<a href="/graphrag/posts/index/2-cli/">CLI</a>
</li><li>
<a href="/graphrag/posts/index/3-prompt_tuning/">Prompt Tuning</a>
</li><li>
<a href="/graphrag/posts/index/workflows/overview/">Workflows</a>
<ul hidden=""><li>
<a href="/graphrag/posts/index/workflows/create_base_documents/">create_base_documents</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_base_entity_graph/">create_base_entity_graph</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_base_extracted_entities/">create_base_extracted_entities</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_base_text_units/">create_base_text_units</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_communities/">create_final_communities</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_community_reports/">create_final_community_reports</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_covariates/">create_final_covariates</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_documents/">create_final_documents</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_entities/">create_final_entities</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_nodes/">create_final_nodes</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_relationships/">create_final_relationships</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_final_text_units/">create_final_text_units</a>
</li><li>
<a href="/graphrag/posts/index/workflows/create_summarized_entities/">create_summarized_entities</a>
</li></ul>
</li>
<li>
<a href="/graphrag/posts/index/verbs/overview/">Verbs</a>
<ul hidden=""><li>
<a href="/graphrag/posts/index/verbs/aggregate/">aggregate</a>
</li><li>
<a href="/graphrag/posts/index/verbs/chunk/">chunk</a>
</li><li>
<a href="/graphrag/posts/index/verbs/cluster_graph/">cluster_graph</a>
</li><li>
<a href="/graphrag/posts/index/verbs/concat/">concat</a>
</li><li>
<a href="/graphrag/posts/index/verbs/create_graph/">create_graph</a>
</li><li>
<a href="/graphrag/posts/index/verbs/genid/">genid</a>
</li><li>
<a href="/graphrag/posts/index/verbs/layout_graph/">layout_graph</a>
</li><li>
<a href="/graphrag/posts/index/verbs/merge/">merge</a>
</li><li>
<a href="/graphrag/posts/index/verbs/merge_graphs/">merge_graphs</a>
</li><li>
<a href="/graphrag/posts/index/verbs/spread_json/">spread_json</a>
</li><li>
<a href="/graphrag/posts/index/verbs/text_replace/">text_replace</a>
</li><li>
<a href="/graphrag/posts/index/verbs/text_split/">text_split</a>
</li><li>
<a href="/graphrag/posts/index/verbs/unpack_graph/">unpack_graph</a>
</li><li>
<a href="/graphrag/posts/index/verbs/unzip/">unzip</a>
</li><li>
<a href="/graphrag/posts/index/verbs/zip/">zip</a>
</li></ul>
</li>
<li>
<a href="/graphrag/posts/config/overview/">Configuration</a>
<ul>
<li>
<a href="/graphrag/posts/config/env_vars">Using Env Vars</a>
</li>
<li>
<a href="/graphrag/posts/config/json_yaml">Using JSON or YAML</a>
</li>
<li>
<a href="/graphrag/posts/config/custom">Fully Custom</a>
</li>
</ul>
</li>
</ul>
</li>
<!-- Query Links -->
<li>
<a href="/graphrag/posts/query/overview/">Query</a>
<ul><li>
<a href="/graphrag/posts/query/0-global_search/">Global Search</a>
</li><li>
<a href="/graphrag/posts/query/1-local_search/">Local Search</a>
</li><li>
<a href="/graphrag/posts/query/2-question_generation/">Question Generation</a>
</li><li>
<a href="/graphrag/posts/query/3-cli/">CLI</a>
</li><li>
<a href="/graphrag/posts/query/notebooks/overview/">Notebooks</a>
<ul>
<li>
<a href="/graphrag/posts/query/notebooks/global_search_nb">Global Search</a>
</li>
<li>
<a href="/graphrag/posts/query/notebooks/local_search_nb">Local Search</a>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</aside>
<!-- Main Content -->
<main>
<h1>Welcome to GraphRAG</h1>
<p>👉 <a href="https://github.com/microsoft/graphrag">GitHub Repository</a></p>
<!-- Figure 1 and its caption; centred with Bulma's has-text-centered helper
     rather than the obsolete align attribute on <p>. -->
<p class="has-text-centered">
<img src="img/GraphRag-Figure1.jpg" alt="Figure 1: LLM-generated knowledge graph built from a private dataset using GPT-4 Turbo." width="450" align="center">
</p>
<p class="has-text-centered">
Figure 1: An LLM-generated knowledge graph built using GPT-4 Turbo.
</p>
<p>GraphRAG is a structured, hierarchical approach to Retrieval Augmented Generation (RAG), as opposed to naive semantic-search
approaches using plain text snippets. The GraphRAG process involves extracting a knowledge graph out of raw text, building a community hierarchy, generating summaries for these communities, and then leveraging these structures when performing RAG-based tasks.</p>
<h2>Get Started 🚀</h2>
<p>To start using GraphRAG, check out the <a href="posts/get_started"><em>Get Started</em></a> guide.
For a deeper dive into the main sub-systems, please visit the docpages for the <a href="posts/index/overview">Indexer</a> and <a href="posts/query/overview">Query</a> packages.</p>
<h2>GraphRAG vs Baseline RAG 🔍</h2>
<p>Retrieval-Augmented Generation (RAG) is a technique to improve LLM outputs using real-world information. This technique is an important part of most LLM-based tools and the majority of RAG approaches use vector similarity as the search technique, which we call <em>Baseline RAG</em>. GraphRAG uses knowledge graphs to provide substantial improvements in question-and-answer performance when reasoning about complex information. RAG techniques have shown promise in helping LLMs to reason about <em>private datasets</em> - data that the LLM is not trained on and has never seen before, such as an enterprise's proprietary research, business documents, or communications. <em>Baseline RAG</em> was created to help solve this problem, but we observe situations where baseline RAG performs very poorly. For example:</p>
<ul>
<li>Baseline RAG struggles to connect the dots. This happens when answering a question requires traversing disparate pieces of information through their shared attributes in order to provide new synthesized insights.</li>
<li>Baseline RAG performs poorly when being asked to holistically understand summarized semantic concepts over large data collections or even singular large documents.</li>
</ul>
<p>To address this, the tech community is working to develop methods that extend and enhance RAG. Microsoft Research's new approach, GraphRAG, uses LLMs to create a knowledge graph based on an input corpus. This graph, along with community summaries and graph machine learning outputs, is used to augment prompts at query time. GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.</p>
<h2>The GraphRAG Process 🤖</h2>
<p>GraphRAG builds upon our prior <a href="https://www.microsoft.com/en-us/worklab/patterns-hidden-inside-the-org-chart">research</a> and <a href="https://github.com/microsoft/graspologic">tooling</a> using graph machine learning. The basic steps of the GraphRAG process are as follows:</p>
<h3>Index</h3>
<ul>
<li>Slice up an input corpus into a series of TextUnits, which act as analyzable units for the rest of the process, and provide fine-grained references into our outputs.</li>
<li>Extract all entities, relationships, and key claims from the TextUnits using an LLM.</li>
<li>Perform a hierarchical clustering of the graph using the <a href="https://arxiv.org/pdf/1810.08473.pdf">Leiden technique</a>. To see this visually, check out Figure 1 above. Each circle is an entity (e.g., a person, place, or organization), with the size representing the degree of the entity, and the color representing its community.</li>
<li>Generate summaries of each community and its constituents from the bottom-up. This aids in holistic understanding of the dataset.</li>
</ul>
<h3>Query</h3>
<p>At query time, these structures are used to provide materials for the LLM context window when answering a question. The primary query modes are:</p>
<ul>
<li><a href="posts/query/0-global_search"><em>Global Search</em></a> for reasoning about holistic questions about the corpus by leveraging the community summaries.</li>
<li><a href="posts/query/1-local_search"><em>Local Search</em></a> for reasoning about specific entities by fanning-out to their neighbors and associated concepts.</li>
</ul>
</main>
</div>
<footer>
<a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a>
|
<a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a>
|
<!-- This span behaves like a button (it calls manageConsent()), so it exposes
     a button role, is focusable, and responds to Enter and Space as well as
     to clicks. -->
<span id="cookiesManager" role="button" tabindex="0" onclick="manageConsent();" onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); manageConsent(); }">Cookies</span>
|
<a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a>
|
<a href="https://www.microsoft.com/trademarks">Trademarks</a>
|
<a href="https://www.microsoft.com" id="copyright"></a>
|
<a href="https://github.com/microsoft/graphrag">GitHub</a>
</footer>
</body>
</html>