graphrag/migration_scripts/extract_text_embeddings.ipynb

157 lines
4.3 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SCRIPT TO EXTRACT EXISTING TEXT EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES"
]
},
2024-10-10 16:17:21 -06:00
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2024 Microsoft Corporation.\n",
"# Licensed under the MIT License."
]
},
{
"cell_type": "code",
2024-10-10 15:31:46 -06:00
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
2024-10-10 12:05:41 -06:00
"\n",
"sys.path.insert(1, \"../../\")"
]
},
{
"cell_type": "code",
2024-10-10 15:31:46 -06:00
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
2024-10-10 12:05:41 -06:00
"import re\n",
"from pathlib import Path\n",
"\n",
2024-10-10 15:39:40 -06:00
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED"
]
},
{
"cell_type": "code",
2024-10-10 15:31:46 -06:00
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# set local folder where the index data is located\n",
2024-10-10 15:36:07 -06:00
"LOCAL_ROOT = \"<local-path-to-data-folder>\"\n",
"\n",
"# when True, the source file is rewritten without the extracted embedding column; when False, the source file is left unchanged\n",
2024-10-10 15:29:35 -06:00
"REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
"\n",
2024-10-10 15:39:40 -06:00
"# identifier field\n",
"STANDARD_IDENTIFIER_FIELD = \"id\"\n",
"\n",
2024-10-10 15:39:40 -06:00
"# new embedding field name\n",
"NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-10-10 12:05:41 -06:00
"### GENERIC METHOD TO EXTRACT AN EMBEDDING COLUMN FROM A FILE AND CREATE A NEW EMBEDDINGS-SPECIFIC FILE"
]
},
{
"cell_type": "code",
2024-10-10 15:31:46 -06:00
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
2024-10-10 15:39:40 -06:00
"def extract_text_embedding_from_table(\n",
"    input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str\n",
") -> None:\n",
"    \"\"\"Split one embedding column of a parquet table out into its own parquet file.\n",
"\n",
"    Reads the table at ``input_path`` and writes a two-column table\n",
"    (``STANDARD_IDENTIFIER_FIELD`` plus the embedding column renamed to\n",
"    ``NEW_STANDARD_EMBEDDING_FIELD``) to ``embeddings_parquet_output_field``.\n",
"    When ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is True, the source\n",
"    file is then overwritten with the same table minus the embedding column.\n",
"\n",
"    Args:\n",
"        input_path: Path of the parquet file to read (and optionally rewrite).\n",
"        original_embedding_field: Name of the embedding column to extract.\n",
"        embeddings_parquet_output_field: Path of the parquet file to create\n",
"            (despite the name, this is a file path, not a column name).\n",
"\n",
"    Raises:\n",
"        KeyError: If the identifier column or the embedding column is not\n",
"            present in the table. Raised before anything is written.\n",
"    \"\"\"\n",
"    original_df = pd.read_parquet(input_path)\n",
"\n",
"    # Check the columns explicitly so a failure names the file and the missing\n",
"    # column(s) instead of surfacing as a bare pandas KeyError.\n",
"    required_columns = [STANDARD_IDENTIFIER_FIELD, original_embedding_field]\n",
"    missing_columns = [c for c in required_columns if c not in original_df.columns]\n",
"    if missing_columns:\n",
"        msg = f\"{input_path} is missing required column(s): {missing_columns}\"\n",
"        raise KeyError(msg)\n",
"\n",
"    embeddings_df = original_df[required_columns].rename(\n",
"        columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}\n",
"    )\n",
"    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
"\n",
"    # The source file is only rewritten after the extracted file has been saved,\n",
"    # and the column-stripped copy is only built when it is actually needed.\n",
"    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n",
"        no_embeddings_df = original_df.drop(columns=[original_embedding_field])\n",
"        no_embeddings_df.to_parquet(input_path, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-10-10 12:05:41 -06:00
"### ITERATE OVER ALL PARQUET FILES IN THE FOLDER AND EXTRACT EVERY EMBEDDING COLUMN FOUND IN THEM"
]
},
{
"cell_type": "code",
2024-10-10 15:29:35 -06:00
"execution_count": null,
2024-10-10 12:05:41 -06:00
"metadata": {},
2024-10-10 15:29:35 -06:00
"outputs": [],
2024-10-10 12:05:41 -06:00
"source": [
2024-10-10 15:39:40 -06:00
"# READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n",
"folder_path = Path(LOCAL_ROOT)\n",
"# a column is treated as an embedding column when its name ends in \"_embedding\"\n",
"pattern = r\"^(.*?)(_embedding)$\"\n",
"\n",
"# Snapshot and sort the listing before doing any work: extraction writes new\n",
"# parquet files into this same folder, and sorting keeps the processing order\n",
"# (and the printed log) reproducible between runs.\n",
"parquet_files = sorted(\n",
"    path\n",
"    for path in folder_path.iterdir()\n",
"    if path.is_file() and path.suffix == \".parquet\"\n",
")\n",
"\n",
"for file_path in parquet_files:\n",
"    original_df = pd.read_parquet(str(file_path))\n",
"    columns = original_df.columns.tolist()\n",
"\n",
"    for column in columns:\n",
"        match = re.match(pattern, column)\n",
"        if match:\n",
"            print(f\"Reading {file_path}\")\n",
"            filename_without_extension = str(file_path.with_suffix(\"\").as_posix())\n",
"            # output name: <source name>_<column>s<suffix>, written next to the source\n",
"            embedding_file_name = (\n",
"                f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n",
"            )\n",
"            extract_text_embedding_from_table(\n",
"                str(file_path), column, embedding_file_name\n",
"            )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}