2024-10-02 22:00:54 -06:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## SCRIPT TO EXTRACT EXISTING TEXT EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-10-10 12:05:41 -06:00
|
|
|
"execution_count": 42,
|
2024-10-02 22:00:54 -06:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import sys\n",
|
2024-10-10 12:05:41 -06:00
|
|
|
"\n",
|
|
|
|
"sys.path.insert(1, \"../../\")"
|
2024-10-02 22:00:54 -06:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-10-10 12:05:41 -06:00
|
|
|
"execution_count": 43,
|
2024-10-02 22:00:54 -06:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2024-10-10 12:05:41 -06:00
|
|
|
"import re\n",
|
|
|
|
"from pathlib import Path\n",
|
|
|
|
"\n",
|
|
|
|
"import pandas as pd\n"
|
2024-10-02 22:00:54 -06:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-10-10 12:05:41 -06:00
|
|
|
"execution_count": 44,
|
2024-10-02 22:00:54 -06:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
"# Folder that holds the parquet index files to process.\n",
"# Alternative location, kept for reference:\n",
"# LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"  # noqa: ERA001\n",
"LOCAL_ROOT = \"/Users/gaudy-microsoft/Desktop/test-mock-embeddings\"\n",
"\n",
"# True: each source file is rewritten without its embedding column.\n",
"# False: the source files are left as they are.\n",
"REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = False\n",
"\n",
"# Name of the identifier column copied into every embeddings file.\n",
"STANDARD_IDENTIFIER_FIELD = \"id\"\n",
"\n",
"# Name given to the embedding column in the new embeddings files.\n",
"NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
2024-10-10 12:05:41 -06:00
|
|
|
"### GENERIC METHOD TO EXTRACT EMBEDDING COLUMNS FROM A FILE AND CREATE A NEW EMBEDDINGS-SPECIFIC FILE"
|
2024-10-02 22:00:54 -06:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-10-10 12:05:41 -06:00
|
|
|
"execution_count": 45,
|
2024-10-02 22:00:54 -06:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2024-10-10 12:05:41 -06:00
|
|
"def extract_text_embedding_from_table(\n",
"    input_path: str,\n",
"    original_embedding_field: str,\n",
"    embeddings_parquet_output_field: str,\n",
") -> None:\n",
"    \"\"\"Split one embedding column of a parquet table out into its own parquet file.\n",
"\n",
"    Reads the table at ``input_path`` and writes a new parquet file that holds\n",
"    only the ``STANDARD_IDENTIFIER_FIELD`` column and the embedding column,\n",
"    the latter renamed to ``NEW_STANDARD_EMBEDDING_FIELD``. When\n",
"    ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is set, the source\n",
"    file is then overwritten without the embedding column.\n",
"\n",
"    Args:\n",
"        input_path: Path of the parquet file to read (and optionally rewrite).\n",
"        original_embedding_field: Name of the embedding column to extract.\n",
"        embeddings_parquet_output_field: Path of the parquet file that receives\n",
"            the extracted embeddings (despite its name, this is a file path).\n",
"\n",
"    Raises:\n",
"        KeyError: If the identifier or the embedding column is not in the table.\n",
"    \"\"\"\n",
"    original_df = pd.read_parquet(input_path)\n",
"\n",
"    # Fail with a clear message, before anything is written, if a column is absent.\n",
"    required_columns = [STANDARD_IDENTIFIER_FIELD, original_embedding_field]\n",
"    missing_columns = [name for name in required_columns if name not in original_df.columns]\n",
"    if missing_columns:\n",
"        msg = f\"{input_path} is missing required column(s): {missing_columns}\"\n",
"        raise KeyError(msg)\n",
"\n",
"    embeddings_df = original_df[required_columns].rename(\n",
"        columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}\n",
"    )\n",
"    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
"\n",
"    # Build the stripped table only when the source file is actually rewritten.\n",
"    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE:\n",
"        no_embeddings_df = original_df.drop(columns=[original_embedding_field])\n",
"        no_embeddings_df.to_parquet(input_path, index=False)"
|
2024-10-02 22:00:54 -06:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
2024-10-10 12:05:41 -06:00
|
|
|
"### ITERATE OVER ALL PARQUET FILES IN THE FOLDER AND DETECT THE EMBEDDING COLUMNS IN EACH OF THEM"
|
2024-10-02 22:00:54 -06:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2024-10-10 12:05:41 -06:00
|
|
|
"execution_count": 46,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"Reading /Users/gaudy-microsoft/Desktop/test-mock-embeddings/create_final_text_units.parquet\n",
|
|
|
|
"Reading /Users/gaudy-microsoft/Desktop/test-mock-embeddings/create_final_community_reports.parquet\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
"# Scan every parquet file in the data folder and split out its embedding columns.\n",
"folder_path = Path(LOCAL_ROOT)\n",
"pattern = r\"^(.*?)(_embedding)$\"\n",
"\n",
"# sorted() gives a stable processing order; iterdir() order is arbitrary.\n",
"for file_path in sorted(folder_path.iterdir()):\n",
"    if not (file_path.is_file() and file_path.suffix == \".parquet\"):\n",
"        continue\n",
"\n",
"    print(f\"Reading {file_path}\")\n",
"    original_df = pd.read_parquet(str(file_path))\n",
"\n",
"    # Map each embedding column to the part of its name before \"_embedding\".\n",
"    # Non-string column labels cannot match the pattern and are skipped.\n",
"    embedding_columns = {}\n",
"    for column in original_df.columns.tolist():\n",
"        match = re.match(pattern, column) if isinstance(column, str) else None\n",
"        if match:\n",
"            embedding_columns[column] = match.group(1)\n",
"\n",
"    base_path = file_path.with_suffix(\"\").as_posix()\n",
"    for column, source_column_name in embedding_columns.items():\n",
"        if len(embedding_columns) == 1:\n",
"            # Single embedding column: keep the established output file name.\n",
"            output_path = f\"{base_path}_embeddings{file_path.suffix}\"\n",
"        else:\n",
"            # Several embedding columns would otherwise overwrite one another's\n",
"            # output file, so the source column name is added to each file name.\n",
"            output_path = f\"{base_path}_{source_column_name}_embeddings{file_path.suffix}\"\n",
"        extract_text_embedding_from_table(str(file_path), column, output_path)\n"
|
2024-10-02 22:00:54 -06:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": ".venv",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.11.10"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 2
|
|
|
|
}
|