script update

2026-01-07 12:40:58 +00:00 · 2024-10-10 15:31:46 -06:00 · 2024-10-10 15:31:46 -06:00 · 6e24503dc7
commit 6e24503dc7
parent 5eac54ce02
2 changed files with 5 additions and 149 deletions
--- a/migration_scripts/extract_graph_embeddings.ipynb
+++ b/migration_scripts/extract_graph_embeddings.ipynb
@ -1,144 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## SCRIPT TO EXTRACT EXISTING GRAPH EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "sys.path.insert(1, '../../')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# set local folder where the index data is located\n",
-    "LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"\n",
-    "\n",
-    "# value to decide if the original file should maintain or remove the embedding column\n",
-    "REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
-    "\n",
-    "#identifier field\n",
-    "STANDARD_IDENTIFIER_FIELD = \"id\"\n",
-    "\n",
-    "#new embedding field name\n",
-    "NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def extract_graph_embedding_from_table(input_path: str, embedding_field: str, embeddings_parquet_output_field: str):\n",
-    "    \"\"\"Migrate table for embeddings.\"\"\"\n",
-    "    original_df = pd.read_parquet(input_path)\n",
-    "    no_embeddings_df = original_df.drop(columns=[embedding_field])\n",
-    "    \n",
-    "    embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]]\n",
-    "    embeddings_df = embeddings_df.rename(columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD}) # type: ignore\n",
-    "    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
-    "\n",
-    "    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n",
-    "        no_embeddings_df.to_parquet(input_path, index=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### EMBEDDINGS TO MIGRATE IN FILE: `create_base_entity_graph.parquet`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#input file with the embedding column\n",
-    "INPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph.parquet\"\n",
-    "\n",
-    "#output file for embeddings\n",
-    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph_embeddings.parquet\"\n",
-    "\n",
-    "#output file without embeddings\n",
-    "NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph.parquet\"\n",
-    "\n",
-    "#embedding field\n",
-    "EMBEDDING_FIELD = \"embeddings\"\n",
-    "\n",
-    "extract_graph_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### EMBEDDINGS TO MIGRATE IN FILE: `create_final_entities.parquet`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#input file with the embedding column\n",
-    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
-    "\n",
-    "#output file for embeddings\n",
-    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities_embeddings.parquet\"\n",
-    "\n",
-    "#output file without embeddings\n",
-    "NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
-    "\n",
-    "#embedding field\n",
-    "EMBEDDING_FIELD = \"graph_embedding\"\n",
-    "\n",
-    "extract_graph_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.11.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/migration_scripts/extract_text_embeddings.ipynb
+++ b/migration_scripts/extract_text_embeddings.ipynb
@ -9,7 +9,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
@ -20,7 +20,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
@ -39,12 +39,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set local folder where the index data is located\n",
-    "LOCAL_ROOT = \"/Users/gaudy-microsoft/Desktop/output-con-embeddings\"\n",
+    "LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"\n",
    "\n",
    "# value to decide if the original file should maintain or remove the embedding column\n",
    "REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
@ -65,7 +65,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [