diff --git a/migration_scripts/extract_graph_embeddings.ipynb b/migration_scripts/extract_graph_embeddings.ipynb deleted file mode 100644 index 648c0daa..00000000 --- a/migration_scripts/extract_graph_embeddings.ipynb +++ /dev/null @@ -1,144 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SCRIPT TO EXTRACT EXISTING GRAPH EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.insert(1, '../../')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# set local folder where the index data is located\n", - "LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"\n", - "\n", - "# value to decide if the original file should maintain or remove the embedding column\n", - "REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n", - "\n", - "#identifier field\n", - "STANDARD_IDENTIFIER_FIELD = \"id\"\n", - "\n", - "#new embedding field name\n", - "NEW_STANDARD_EMBEDDING_FIELD = \"embedding\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_graph_embedding_from_table(input_path: str, embedding_field: str, embeddings_parquet_output_field: str):\n", - " \"\"\"Migrate table for embeddings.\"\"\"\n", - " original_df = pd.read_parquet(input_path)\n", - " no_embeddings_df = original_df.drop(columns=[embedding_field])\n", - " \n", - " embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]]\n", - " embeddings_df = embeddings_df.rename(columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD}) # type: ignore\n", - " embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n", - "\n", - " if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n", - " no_embeddings_df.to_parquet(input_path, index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### EMBEDDINGS TO MIGRATE IN FILE: `create_base_entity_graph.parquet`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#input file with the embedding column\n", - "INPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph.parquet\"\n", - "\n", - "#output file for embeddings\n", - "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph_embeddings.parquet\"\n", - "\n", - "#output file without embeddings\n", - "NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph.parquet\"\n", - "\n", - "#embedding field\n", - "EMBEDDING_FIELD = \"embeddings\"\n", - "\n", - "extract_graph_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### EMBEDDINGS TO MIGRATE IN FILE: `create_final_entities.parquet`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#input file with the embedding column\n", - "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n", - "\n", - "#output file for embeddings\n", - "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities_embeddings.parquet\"\n", - "\n", - "#output file without embeddings\n", - "NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n", - "\n", - "#embedding field\n", - "EMBEDDING_FIELD = \"graph_embedding\"\n", - "\n", - "extract_graph_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/migration_scripts/extract_text_embeddings.ipynb b/migration_scripts/extract_text_embeddings.ipynb index 5ab99bbb..22c39fea 100644 --- a/migration_scripts/extract_text_embeddings.ipynb +++ b/migration_scripts/extract_text_embeddings.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -39,12 +39,12 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "# set local folder where the index data is located\n", - "LOCAL_ROOT = \"/Users/gaudy-microsoft/Desktop/output-con-embeddings\"\n", + "LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"\n", "\n", "# value to decide if the original file should maintain or remove the embedding column\n", "REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n", @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [