# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
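"""A minimal example that runs a one-step "derive" pipeline over a tiny
dataset, first from a YAML config file and then via the Python API."""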
import asyncio
import os

import pandas as pd

from graphrag.index import run_pipeline, run_pipeline_with_config
from graphrag.index.config import PipelineWorkflowReference

# our fake dataset
dataset = pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}])


async def run_with_config():
    """Run a pipeline with a config file"""
    # load pipeline.yml in this directory
    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml"
    )
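
    # The config-file route presumably mirrors run_python() below: a single
    # "derive" step. A minimal sketch of what pipeline.yml might contain
    # (an assumption; the actual file in this directory is the source of truth):
    #
    # workflows:
    #   - steps:
    #       - verb: derive
    #         args:
    #           column1: col1
    #           column2: col2
    #           to: col_multiplied
    #           operator: "*"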
    tables = []
    async for table in run_pipeline_with_config(
        config_or_path=config_path, dataset=dataset
    ):
        tables.append(table)
    pipeline_result = tables[-1]
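    # (the pipeline yields one result per workflow as it completes, so the
    # last yielded table should be the final output)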

    if pipeline_result.result is not None:
        # Should look something like this, and should be identical to the
        # output of the python example:
        #    col1  col2  col_multiplied
        # 0     2     4               8
        # 1     5    10              50
        print(pipeline_result.result)
    else:
        print("No results!")


async def run_python():
    """Run a pipeline using the python API"""
    workflows: list[PipelineWorkflowReference] = [
        PipelineWorkflowReference(
            steps=[
                {
                    # built-in verb
                    "verb": "derive",  # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/verbs/derive.py
                    "args": {
                        "column1": "col1",  # from above
                        "column2": "col2",  # from above
                        "to": "col_multiplied",  # new column name
                        "operator": "*",  # multiply the two columns
                    },
                    # Since we're acting on the default input, we don't need
                    # to explicitly specify an input
                }
            ]
        ),
    ]
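
    # For comparison, the same derivation in plain pandas (a hypothetical
    # sanity check, not part of the pipeline):
    #   expected = dataset.assign(col_multiplied=dataset["col1"] * dataset["col2"])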

    # Grab the last result from the pipeline; it should be our table with the
    # new derived column
    tables = []
    async for table in run_pipeline(dataset=dataset, workflows=workflows):
        tables.append(table)
    pipeline_result = tables[-1]

    if pipeline_result.result is not None:
        # Should look something like this:
        #    col1  col2  col_multiplied
        # 0     2     4               8
        # 1     5    10              50
        print(pipeline_result.result)
    else:
        print("No results!")
if __name__ == "__main__":
    asyncio.run(run_with_config())
    asyncio.run(run_python())