graphrag/graphrag/index/operations/extract_graph/graph_intelligence_strategy.py

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A module containing run_graph_intelligence, run_extract_graph and _create_text_splitter methods to run graph intelligence."""
import networkx as nx
from fnllm.types import ChatLLM
import graphrag.config.defaults as defs
from graphrag.cache.pipeline_cache import PipelineCache
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.index.llm.load_llm import load_llm
from graphrag.index.operations.extract_graph.graph_extractor import GraphExtractor
from graphrag.index.operations.extract_graph.typing import (
Document,
EntityExtractionResult,
EntityTypes,
StrategyConfig,
)


async def run_graph_intelligence(
docs: list[Document],
entity_types: EntityTypes,
callbacks: WorkflowCallbacks,
cache: PipelineCache,
args: StrategyConfig,
) -> EntityExtractionResult:
"""Run the graph intelligence entity extraction strategy."""
llm_config = LanguageModelConfig(**args["llm"])
llm = load_llm(
"extract_graph",
llm_config,
callbacks=callbacks,
cache=cache,
)
return await run_extract_graph(llm, docs, entity_types, callbacks, args)


async def run_extract_graph(
llm: ChatLLM,
docs: list[Document],
entity_types: EntityTypes,
callbacks: WorkflowCallbacks | None,
args: StrategyConfig,
) -> EntityExtractionResult:
"""Run the entity extraction chain."""
tuple_delimiter = args.get("tuple_delimiter", None)
record_delimiter = args.get("record_delimiter", None)
completion_delimiter = args.get("completion_delimiter", None)
extraction_prompt = args.get("extraction_prompt", None)
encoding_model = args.get("encoding_name", None)
max_gleanings = args.get("max_gleanings", defs.EXTRACT_GRAPH_MAX_GLEANINGS)
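
    # Configure the extractor; extraction errors are routed to the workflow
    # callbacks when callbacks are available.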
extractor = GraphExtractor(
llm_invoker=llm,
prompt=extraction_prompt,
encoding_model=encoding_model,
max_gleanings=max_gleanings,
on_error=lambda e, s, d: (
callbacks.error("Entity Extraction Error", e, s, d) if callbacks else None
),
)
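
    # Strip whitespace from each document and run extraction over the batch,
    # passing the prompt variables the extractor substitutes into its template.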
text_list = [doc.text.strip() for doc in docs]
results = await extractor(
list(text_list),
{
"entity_types": entity_types,
"tuple_delimiter": tuple_delimiter,
"record_delimiter": record_delimiter,
"completion_delimiter": completion_delimiter,
},
)
graph = results.output
# Map the "source_id" back to the "id" field
for _, node in graph.nodes(data=True): # type: ignore
if node is not None:
node["source_id"] = ",".join(
docs[int(id)].id for id in node["source_id"].split(",")
)
for _, _, edge in graph.edges(data=True): # type: ignore
if edge is not None:
edge["source_id"] = ",".join(
docs[int(id)].id for id in edge["source_id"].split(",")
)
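
    # Flatten the graph nodes into entity records: the node key becomes the
    # entity title and the node attributes are carried along as-is.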
entities = [
({"title": item[0], **(item[1] or {})})
for item in graph.nodes(data=True)
if item is not None
]
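
    # Export the graph edges (with their attributes) as a pandas edge list of relationships.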
relationships = nx.to_pandas_edgelist(graph)
return EntityExtractionResult(entities, relationships, graph)