Mirror of https://github.com/microsoft/graphrag.git (synced 2025-07-08 09:31:54 +00:00)

Correct links to verbs in comments

Updated the links in comments to reflect the new paths for the 'derive' and 'aggregate' verbs. This improves documentation and ensures that references are up to date for future developers.

Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
78 lines
2.5 KiB
Python
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

import asyncio
import os

import pandas as pd

from graphrag.index import run_pipeline, run_pipeline_with_config
from graphrag.index.config import PipelineWorkflowReference

# our fake dataset
dataset = pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}])


async def run_with_config():
    """Run a pipeline with a config file"""
    # load pipeline.yml in this directory
    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml"
    )
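    # NOTE: pipeline.yml itself isn't shown in this file. Per the comment below, its
    # output should be identical to run_python(), so presumably it defines the same
    # single "derive" workflow. A rough, illustrative sketch only (the exact schema
    # may differ):
    #
    #   workflows:
    #     - steps:
    #         - verb: derive
    #           args:
    #             column1: col1
    #             column2: col2
    #             to: col_multiplied
    #             operator: "*"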

    tables = []
    async for table in run_pipeline_with_config(
        config_or_path=config_path, dataset=dataset
    ):
        tables.append(table)
    pipeline_result = tables[-1]

    if pipeline_result.result is not None:
        # Should look something like this, which should be identical to the python example:
        #    col1  col2  col_multiplied
        # 0     2     4               8
        # 1     5    10              50
        print(pipeline_result.result)
    else:
        print("No results!")


async def run_python():
    """Run a pipeline using the python API"""
    workflows: list[PipelineWorkflowReference] = [
        PipelineWorkflowReference(
            steps=[
                {
                    # built-in verb
                    "verb": "derive",  # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/verbs/derive.py
                    "args": {
                        "column1": "col1",  # from above
                        "column2": "col2",  # from above
                        "to": "col_multiplied",  # new column name
                        "operator": "*",  # multiply the two columns
                    },
                    # Since we're trying to act on the default input, we don't need to explicitly specify an input
                }
            ]
        ),
    ]
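    # For illustration only (not part of the pipeline): the "derive" step above is
    # conceptually the same as doing
    #   dataset["col_multiplied"] = dataset["col1"] * dataset["col2"]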

    # Grab the last result from the pipeline, should be our derived table
    tables = []
    async for table in run_pipeline(dataset=dataset, workflows=workflows):
        tables.append(table)
    pipeline_result = tables[-1]

    if pipeline_result.result is not None:
        # Should look something like this:
        #    col1  col2  col_multiplied
        # 0     2     4               8
        # 1     5    10              50
        print(pipeline_result.result)
    else:
        print("No results!")


if __name__ == "__main__":
    asyncio.run(run_with_config())
    asyncio.run(run_python())