234 lines
8.0 KiB
Python
Raw Normal View History

2024-06-26 15:45:06 -04:00
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import hashlib
2024-12-30 01:59:08 -05:00
import os
2024-06-26 15:45:06 -04:00
import re
import pandas as pd
2025-01-28 00:34:04 -05:00
from azure.core.exceptions import ResourceNotFoundError
2024-12-30 01:59:08 -05:00
from azure.cosmos import exceptions
from azure.identity import DefaultAzureCredential
2024-09-12 21:41:46 -04:00
from fastapi import HTTPException
2024-06-26 15:45:06 -04:00
2025-01-28 00:34:04 -05:00
from graphrag_app.logger.load_logger import load_pipeline_logger
2025-01-25 04:07:53 -05:00
from graphrag_app.utils.azure_clients import AzureClientManager
2024-06-26 15:45:06 -04:00
2024-12-30 01:59:08 -05:00
def get_df(
    table_path: str,
) -> pd.DataFrame:
    """
    Load a parquet table into a pandas DataFrame.

    Args:
    -----
    table_path (str)
        Location of the parquet file; handed to pandas.read_parquet unchanged.

    Returns: pd.DataFrame
        The contents of the parquet file.
    """
    # Credentials/account details come from pandas_storage_options().
    return pd.read_parquet(table_path, storage_options=pandas_storage_options())
def pandas_storage_options() -> dict:
    """
    Build the storage options required by pandas to read parquet files from Storage.

    The account name and host are taken from AzureClientManager. If the
    STORAGE_CONNECTION_STRING environment variable is set to a non-empty value it
    is used for authentication; otherwise a DefaultAzureCredential is supplied.

    Returns: dict
        Keyword options suitable for the `storage_options` argument of pandas readers.
    """
    # Reference for the available options:
    # https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials
    client_manager = AzureClientManager()
    storage_options = dict(
        account_name=client_manager.storage_account_name,
        account_host=client_manager.storage_account_hostname,
    )
    connection_string = os.getenv("STORAGE_CONNECTION_STRING")
    if connection_string:
        storage_options["connection_string"] = connection_string
        return storage_options
    storage_options["credential"] = DefaultAzureCredential()
    return storage_options
2024-06-26 15:45:06 -04:00
def delete_blob_container(container_name: str):
    """
    Delete a blob container, silently succeeding if it does not exist.

    Only ResourceNotFoundError is swallowed; any other exception propagates and
    should be handled by the calling function.

    Args:
    -----
    container_name (str)
        Name of the blob container to delete.
    """
    service_client = AzureClientManager().get_blob_service_client()
    try:
        service_client.delete_container(container_name)
    except ResourceNotFoundError:
        # The container is already gone, which is the desired end state.
        return
2024-06-26 15:45:06 -04:00
2024-12-30 01:59:08 -05:00
def delete_cosmos_container_item(container: str, item_id: str):
    """
    Delete an item from a cosmosdb container, silently succeeding if it does not exist.

    Only ResourceNotFoundError is swallowed; any other exception propagates and
    should be handled by the calling function.

    Args:
    -----
    container (str)
        Name of the container inside the "graphrag" database.
    item_id (str)
        Id of the item to delete.
    """
    client_manager = AzureClientManager()
    try:
        container_client = client_manager.get_cosmos_container_client(
            database="graphrag", container=container
        )
        # item_id is passed twice: once as the item and once as the second
        # positional argument (the partition key in the azure-cosmos API).
        container_client.delete_item(item_id, item_id)
    except ResourceNotFoundError:
        # Nothing to delete.
        return
2025-01-28 00:34:04 -05:00
def validate_index_file_exist(sanitized_container_name: str, file_name: str):
    """
    Check if index exists and that the specified blob file exists.

    A "valid" index is defined by having an entry in the container-store table in cosmos db.
    Further checks are done to ensure the blob container and file exist.

    Args:
    -----
    sanitized_container_name (str)
        Sanitized name of a blob container.
    file_name (str)
        The blob file to be validated.

    Raises: ValueError
        If the index has no readable container-store entry, if the blob container
        does not exist, or if the file is missing from the container.
    """
    azure_client_manager = AzureClientManager()
    try:
        cosmos_container_client = get_cosmos_container_store_client()
        index_entry = cosmos_container_client.read_item(
            sanitized_container_name, sanitized_container_name
        )
    except Exception as err:
        # desanitize_name returns None when the entry is missing; fall back to the
        # sanitized name so the message never reads "None is not a valid index."
        original_container_name = (
            desanitize_name(sanitized_container_name) or sanitized_container_name
        )
        raise ValueError(f"{original_container_name} is not a valid index.") from err

    # The human-readable name is needed by the error messages below even when the
    # cosmos lookup succeeded; previously it was only bound inside the except
    # branch, so these paths raised UnboundLocalError instead of ValueError.
    original_container_name = (
        index_entry.get("human_readable_name") or sanitized_container_name
    )

    # check for file existence
    index_container_client = (
        azure_client_manager.get_blob_service_client().get_container_client(
            sanitized_container_name
        )
    )
    if not index_container_client.exists():
        raise ValueError(f"{original_container_name} not found.")
    if not index_container_client.get_blob_client(file_name).exists():
        raise ValueError(
            f"File {file_name} unavailable for container {original_container_name}."
        )
2024-06-26 15:45:06 -04:00
def validate_blob_container_name(container_name: str):
    """
    Check if container name is valid based on Azure resource naming rules.

        - A blob container name must be between 3 and 63 characters in length.
        - Start with a letter or number
        - All letters used in blob container names must be lowercase.
        - Contain only letters, numbers, or the hyphen.
        - Consecutive hyphens are not permitted.
        - Cannot end with a hyphen.

    Args:
    -----
    container_name (str)
        The blob container name to be validated.

    Raises: ValueError
        If any of the naming rules above is violated.
    """
    # Check the length of the name
    if len(container_name) < 3 or len(container_name) > 63:
        raise ValueError(
            f"Container name must be between 3 and 63 characters in length. Name provided was {len(container_name)} characters long."
        )

    # Check if the name starts with a letter or number
    if not container_name[0].isalnum():
        raise ValueError(
            f"Container name must start with a letter or number. Starting character was {container_name[0]}."
        )

    # Check for valid characters (letters, numbers, hyphen) and lowercase letters.
    # fullmatch is required here: with re.match, a trailing "$" also matches just
    # before a final newline, so a name such as "abc\n" would be accepted.
    if not re.fullmatch(r"[a-z0-9-]+", container_name):
        raise ValueError(
            f"Container name must only contain:\n- lowercase letters\n- numbers\n- or hyphens\nName provided was {container_name}."
        )

    # Check for consecutive hyphens
    if "--" in container_name:
        raise ValueError(
            f"Container name cannot contain consecutive hyphens. Name provided was {container_name}."
        )

    # Check for hyphens at the end of the name
    if container_name[-1] == "-":
        raise ValueError(
            f"Container name cannot end with a hyphen. Name provided was {container_name}."
        )
2025-01-28 00:34:04 -05:00
def get_cosmos_container_store_client():
    """
    Return the cosmos container client for the "container-store" container.

    The container lives in the "graphrag" database.

    Raises: HTTPException
        With status code 500 if the client cannot be obtained for any reason;
        the failure is also written to the pipeline logger.
    """
    try:
        return AzureClientManager().get_cosmos_container_client(
            database="graphrag", container="container-store"
        )
    except Exception:
        load_pipeline_logger().error("Error fetching cosmosdb client.")
        raise HTTPException(status_code=500, detail="Error fetching cosmosdb client.")
async def get_blob_container_client(name: str):
    """
    Return an async blob container client, creating the container if it is missing.

    Args:
    -----
    name (str)
        Name of the blob container.

    Raises: HTTPException
        With status code 500 if any step fails; the failure is also written to
        the pipeline logger.
    """
    try:
        service_client = AzureClientManager().get_blob_service_client_async()
        client = service_client.get_container_client(name)
        container_exists = await client.exists()
        if not container_exists:
            await client.create_container()
        return client
    except Exception:
        load_pipeline_logger().error("Error fetching storage client.")
        raise HTTPException(status_code=500, detail="Error fetching storage client.")
def sanitize_name(container_name: str | None) -> str | None:
2024-06-26 15:45:06 -04:00
"""
2025-01-28 00:34:04 -05:00
Sanitize a user-provided string to be used as an Azure Storage container name.
Convert the string to a SHA256 hash, then truncate to 128 bit length to ensure
it is within the 63 character limit imposed by Azure Storage.
2024-06-26 15:45:06 -04:00
The sanitized name will be used to identify container names in both Azure Storage and CosmosDB.
Args:
-----
name (str)
The name to be sanitized.
Returns: str
The sanitized name.
"""
2025-01-28 00:34:04 -05:00
if not container_name:
2024-06-26 15:45:06 -04:00
return None
2025-01-28 00:34:04 -05:00
container_name = container_name.encode()
hashed_name = hashlib.sha256(container_name)
truncated_hash = hashed_name.digest()[:16] # get the first 16 bytes (128 bits)
2024-06-26 15:45:06 -04:00
return truncated_hash.hex()
2025-01-28 00:34:04 -05:00
def desanitize_name(sanitized_container_name: str) -> str | None:
    """
    Retrieve the original user-provided name of a sanitized container name.

    Args:
    -----
    sanitized_container_name (str)
        The sanitized name to be converted back to the original name.

    Returns: str | None
        The original human-readable name, or None if the container-store has
        no entry for the sanitized name.

    Raises: HTTPException
        With status code 500 for any other failure during the lookup.
    """
    try:
        store_client = get_cosmos_container_store_client()
        try:
            entry = store_client.read_item(
                sanitized_container_name, sanitized_container_name
            )
        except exceptions.CosmosResourceNotFoundError:
            # No entry recorded for this sanitized name.
            return None
        return entry["human_readable_name"]
    except Exception:
        raise HTTPException(
            status_code=500, detail="Error retrieving original container name."
        )