graphrag/unified-search-app/app/knowledge_loader/data_prep.py

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Data prep module."""

import logging

import data_config as config
import pandas as pd
import streamlit as st
from knowledge_loader.data_sources.typing import Datasource

"""
Contains functions to load and prep graph-indexed data from parquet files into dataframes.
These output dataframes will then be used to create knowledge model's objects to be used as inputs for the graphrag-orchestration functions
"""
logging.basicConfig(level=logging.INFO)
logging.getLogger("azure").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)


@st.cache_data(ttl=config.default_ttl)
def get_entity_data(dataset: str, _datasource: Datasource) -> pd.DataFrame:
    """Return a dataframe with entity data from the indexed-data."""
    entity_details_df = _datasource.read(config.entity_table)

    print(f"Entity records: {len(entity_details_df)}")  # noqa T201
    print(f"Dataset: {dataset}")  # noqa T201
    return entity_details_df


@st.cache_data(ttl=config.default_ttl)
def get_relationship_data(dataset: str, _datasource: Datasource) -> pd.DataFrame:
    """Return a dataframe with entity-entity relationship data from the indexed-data."""
    relationship_df = _datasource.read(config.relationship_table)
    print(f"Relationship records: {len(relationship_df)}")  # noqa T201
    print(f"Dataset: {dataset}")  # noqa T201
    return relationship_df


@st.cache_data(ttl=config.default_ttl)
def get_covariate_data(dataset: str, _datasource: Datasource) -> pd.DataFrame:
    """Return a dataframe with covariate data from the indexed-data."""
    covariate_df = _datasource.read(config.covariate_table)
    print(f"Covariate records: {len(covariate_df)}")  # noqa T201
    print(f"Dataset: {dataset}")  # noqa T201
    return covariate_df


@st.cache_data(ttl=config.default_ttl)
def get_text_unit_data(dataset: str, _datasource: Datasource) -> pd.DataFrame:
    """Return a dataframe with text units (i.e. chunks of text from the raw documents) from the indexed-data."""
    text_unit_df = _datasource.read(config.text_unit_table)
    print(f"Text unit records: {len(text_unit_df)}")  # noqa T201
    print(f"Dataset: {dataset}")  # noqa T201
    return text_unit_df


@st.cache_data(ttl=config.default_ttl)
def get_community_report_data(
    _datasource: Datasource,
) -> pd.DataFrame:
    """Return a dataframe with community report data from the indexed-data."""
    report_df = _datasource.read(config.community_report_table)
    print(f"Report records: {len(report_df)}")  # noqa T201

    return report_df


@st.cache_data(ttl=config.default_ttl)
def get_communities_data(
    _datasource: Datasource,
) -> pd.DataFrame:
    """Return a dataframe with communities data from the indexed-data."""
    return _datasource.read(config.communities_table)