graphrag/examples/entity_extraction/with_graph_intelligence/run.py

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
import asyncio
import os

from graphrag.index import run_pipeline, run_pipeline_with_config
from graphrag.index.config import PipelineCSVInputConfig, PipelineWorkflowReference
from graphrag.index.input import load_input

sample_data_dir = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "../../_sample_data/"
)

shared_dataset = asyncio.run(
    load_input(
        PipelineCSVInputConfig(
            file_pattern=".*\\.csv$",
            base_dir=sample_data_dir,
            source_column="author",
            text_column="message",
            timestamp_column="date(yyyyMMddHHmmss)",
            timestamp_format="%Y%m%d%H%M%S",
            title_column="message",
        ),
    )
)


async def run_with_config():
    """Run a pipeline with a config file"""
    # We're cheap, and this is an example, lets just do 10
    dataset = shared_dataset.head(10)

    # load pipeline.yml in this directory
    config_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml"
    )

    # Grab the last result from the pipeline, should be our entity extraction
    tables = []
    async for table in run_pipeline_with_config(
        config_or_path=config_path, dataset=dataset
    ):
        tables.append(table)
    pipeline_result = tables[-1]

    # Print the entities.  This will be a row for each text unit, each with a list of entities,
    # This should look pretty close to the python version, but since we're using an LLM
    # it will be a little different depending on how it feels about the text
    if pipeline_result.result is not None:
        print(pipeline_result.result["entities"].to_list())
    else:
        print("No results!")


async def run_python():
    if (
        "EXAMPLE_OPENAI_API_KEY" not in os.environ
        and "OPENAI_API_KEY" not in os.environ
    ):
        msg = "Please set EXAMPLE_OPENAI_API_KEY or OPENAI_API_KEY environment variable to run this example"
        raise Exception(msg)

    # We're cheap, and this is an example, lets just do 10
    dataset = shared_dataset.head(10)

    workflows: list[PipelineWorkflowReference] = [
        PipelineWorkflowReference(
            name="entity_extraction",
            config={
                "entity_extract": {
                    "strategy": {
                        "type": "graph_intelligence",
                        "llm": {
                            "type": "openai_chat",
                            "api_key": os.environ.get(
                                "EXAMPLE_OPENAI_API_KEY",
                                os.environ.get("OPENAI_API_KEY", None),
                            ),
                            "model": os.environ.get(
                                "EXAMPLE_OPENAI_MODEL", "gpt-3.5-turbo"
                            ),
                            "max_tokens": os.environ.get(
                                "EXAMPLE_OPENAI_MAX_TOKENS", 2500
                            ),
                            "temperature": os.environ.get(
                                "EXAMPLE_OPENAI_TEMPERATURE", 0
                            ),
                        },
                    }
                }
            },
        )
    ]

    # Grab the last result from the pipeline, should be our entity extraction
    tables = []
    async for table in run_pipeline(dataset=dataset, workflows=workflows):
        tables.append(table)
    pipeline_result = tables[-1]

    # Print the entities.  This will be a row for each text unit, each with a list of entities
    if pipeline_result.result is not None:
        print(pipeline_result.result["entities"].to_list())
    else:
        print("No results!")


if __name__ == "__main__":
    asyncio.run(run_python())
    asyncio.run(run_with_config())
Initial Release 2024-07-01 15:25:30 -06:00			`# Copyright (c) 2024 Microsoft Corporation.`
			`# Licensed under the MIT License`
			`import asyncio`
			`import os`

			`from graphrag.index import run_pipeline, run_pipeline_with_config`
			`from graphrag.index.config import PipelineCSVInputConfig, PipelineWorkflowReference`
			`from graphrag.index.input import load_input`

			`sample_data_dir = os.path.join(`
			`os.path.dirname(os.path.abspath(__file__)), "../../_sample_data/"`
			`)`

			`shared_dataset = asyncio.run(`
			`load_input(`
			`PipelineCSVInputConfig(`
			`file_pattern=".*\\.csv$",`
			`base_dir=sample_data_dir,`
			`source_column="author",`
			`text_column="message",`
			`timestamp_column="date(yyyyMMddHHmmss)",`
			`timestamp_format="%Y%m%d%H%M%S",`
			`title_column="message",`
			`),`
			`)`
			`)`


			`async def run_with_config():`
			`"""Run a pipeline with a config file"""`
			`# We're cheap, and this is an example, lets just do 10`
			`dataset = shared_dataset.head(10)`

			`# load pipeline.yml in this directory`
			`config_path = os.path.join(`
			`os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml"`
			`)`

			`# Grab the last result from the pipeline, should be our entity extraction`
			`tables = []`
			`async for table in run_pipeline_with_config(`
			`config_or_path=config_path, dataset=dataset`
			`):`
			`tables.append(table)`
			`pipeline_result = tables[-1]`

			`# Print the entities. This will be a row for each text unit, each with a list of entities,`
			`# This should look pretty close to the python version, but since we're using an LLM`
			`# it will be a little different depending on how it feels about the text`
			`if pipeline_result.result is not None:`
			`print(pipeline_result.result["entities"].to_list())`
			`else:`
			`print("No results!")`


			`async def run_python():`
			`if (`
			`"EXAMPLE_OPENAI_API_KEY" not in os.environ`
			`and "OPENAI_API_KEY" not in os.environ`
			`):`
			`msg = "Please set EXAMPLE_OPENAI_API_KEY or OPENAI_API_KEY environment variable to run this example"`
			`raise Exception(msg)`

			`# We're cheap, and this is an example, lets just do 10`
			`dataset = shared_dataset.head(10)`

			`workflows: list[PipelineWorkflowReference] = [`
			`PipelineWorkflowReference(`
			`name="entity_extraction",`
			`config={`
			`"entity_extract": {`
			`"strategy": {`
			`"type": "graph_intelligence",`
			`"llm": {`
			`"type": "openai_chat",`
			`"api_key": os.environ.get(`
			`"EXAMPLE_OPENAI_API_KEY",`
			`os.environ.get("OPENAI_API_KEY", None),`
			`),`
			`"model": os.environ.get(`
			`"EXAMPLE_OPENAI_MODEL", "gpt-3.5-turbo"`
			`),`
			`"max_tokens": os.environ.get(`
			`"EXAMPLE_OPENAI_MAX_TOKENS", 2500`
			`),`
			`"temperature": os.environ.get(`
			`"EXAMPLE_OPENAI_TEMPERATURE", 0`
			`),`
			`},`
			`}`
			`}`
			`},`
			`)`
			`]`

			`# Grab the last result from the pipeline, should be our entity extraction`
			`tables = []`
			`async for table in run_pipeline(dataset=dataset, workflows=workflows):`
			`tables.append(table)`
			`pipeline_result = tables[-1]`

			`# Print the entities. This will be a row for each text unit, each with a list of entities`
			`if pipeline_result.result is not None:`
			`print(pipeline_result.result["entities"].to_list())`
			`else:`
			`print("No results!")`


			`if __name__ == "__main__":`
			`asyncio.run(run_python())`
			`asyncio.run(run_with_config())`