{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SCRIPT TO EXTRACT EXISTING TEXT EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES\n",
    "\n",
    "Each embedding column is copied, together with the row `id`, into its own parquet lookup table. When `REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE` is `True`, the source file is then rewritten in place without that column, so a cell cannot be re-run on a file that was already migrated. Back up the index folder before running."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(1, '../../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set local folder where the index data is located (adjust to your environment)\n",
    "LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"\n",
    "\n",
    "# value to decide if the original file should maintain or remove the embedding column\n",
    "REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
    "\n",
    "# identifier field\n",
    "STANDARD_IDENTIFIER_FIELD = \"id\"\n",
    "\n",
    "# new embedding field name\n",
    "NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_text_embedding_from_table(\n",
    "    input_path: str, embedding_field: str, embeddings_parquet_output_field: str\n",
    ") -> None:\n",
    "    \"\"\"Copy one embedding column of a parquet table into its own lookup table.\n",
    "\n",
    "    Reads the table at ``input_path`` and writes a two-column table\n",
    "    (``STANDARD_IDENTIFIER_FIELD`` plus the embedding column renamed to\n",
    "    ``NEW_STANDARD_EMBEDDING_FIELD``) to ``embeddings_parquet_output_field``.\n",
    "    When ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is set, the file at\n",
    "    ``input_path`` is then rewritten in place without the embedding column.\n",
    "\n",
    "    Args:\n",
    "        input_path: Parquet file that currently holds the embedding column.\n",
    "        embedding_field: Name of the embedding column to extract.\n",
    "        embeddings_parquet_output_field: Destination path of the new embeddings\n",
    "            parquet file (a path, despite the ``_field`` suffix). An existing\n",
    "            file at that path is overwritten, so use one path per embedding.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If the identifier column or the embedding column is not in\n",
    "            the table, e.g. when the column was removed by an earlier run.\n",
    "    \"\"\"\n",
    "    original_df = pd.read_parquet(input_path)\n",
    "\n",
    "    # Fail with an explicit message rather than a bare pandas KeyError.\n",
    "    required_columns = (STANDARD_IDENTIFIER_FIELD, embedding_field)\n",
    "    missing_columns = [name for name in required_columns if name not in original_df.columns]\n",
    "    if missing_columns:\n",
    "        raise KeyError(\n",
    "            f\"{input_path} is missing column(s) {missing_columns}; \"\n",
    "            \"has this embedding already been extracted?\"\n",
    "        )\n",
    "\n",
    "    embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]].rename(\n",
    "        columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD}\n",
    "    )\n",
    "    # Write the lookup table first so the embeddings are on disk before the source is touched.\n",
    "    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
    "\n",
    "    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE:\n",
    "        # Overwrites the source file in place, minus the extracted column.\n",
    "        original_df.drop(columns=[embedding_field]).to_parquet(input_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EMBEDDINGS TO MIGRATE IN FILE: `create_final_community_reports.parquet`\n",
    "\n",
    "This table holds three embedding columns. Each one is written to its own output file so that a later extraction does not overwrite an earlier one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input file with the embedding column\n",
    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
    "\n",
    "# embedding field\n",
    "EMBEDDING_FIELD = \"full_content_embedding\"\n",
    "\n",
    "# output file for this embedding only\n",
    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports_full_content_embeddings.parquet\"\n",
    "\n",
    "extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input file with the embedding column\n",
    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
    "\n",
    "# embedding field\n",
    "EMBEDDING_FIELD = \"summary_embedding\"\n",
    "\n",
    "# output file for this embedding only\n",
    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports_summary_embeddings.parquet\"\n",
    "\n",
    "extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input file with the embedding column\n",
    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
    "\n",
    "# embedding field\n",
    "EMBEDDING_FIELD = \"title_embedding\"\n",
    "\n",
    "# output file for this embedding only\n",
    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports_title_embeddings.parquet\"\n",
    "\n",
    "extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EMBEDDINGS TO MIGRATE IN FILE: `create_final_documents.parquet`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input file with the embedding column\n",
    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_documents.parquet\"\n",
    "\n",
    "# embedding field\n",
    "EMBEDDING_FIELD = \"raw_content_embedding\"\n",
    "\n",
    "# output file for embeddings\n",
    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_documents_embeddings.parquet\"\n",
    "\n",
    "extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EMBEDDINGS TO MIGRATE IN FILE: `create_final_entities.parquet`\n",
    "\n",
    "This table holds two embedding columns. Each one is written to its own output file so that the second extraction does not overwrite the first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input file with the embedding column\n",
    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
    "\n",
    "# embedding field\n",
    "EMBEDDING_FIELD = \"name_embedding\"\n",
    "\n",
    "# output file for this embedding only\n",
    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities_name_embeddings.parquet\"\n",
    "\n",
    "extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input file with the embedding column\n",
    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
    "\n",
    "# embedding field\n",
    "EMBEDDING_FIELD = \"description_embedding\"\n",
    "\n",
    "# output file for this embedding only\n",
    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities_description_embeddings.parquet\"\n",
    "\n",
    "extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EMBEDDINGS TO MIGRATE IN FILE: `create_final_text_units.parquet`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input file with the embedding column\n",
    "INPUT_PATH = f\"{LOCAL_ROOT}/create_final_text_units.parquet\"\n",
    "\n",
    "# embedding field\n",
    "EMBEDDING_FIELD = \"text_embedding\"\n",
    "\n",
    "# output file for embeddings\n",
    "EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_text_units_embeddings.parquet\"\n",
    "\n",
    "extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}