{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SCRIPT TO EXTRACT EXISTING TEXT EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright (c) 2024 Microsoft Corporation.\n",
    "# Licensed under the MIT License."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.insert(1, \"../../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set local folder where the index data is located\n",
    "# (an empty string resolves to the current working directory)\n",
    "LOCAL_ROOT = \"\"\n",
    "\n",
    "# value to decide if the original file should maintain or remove the embedding column\n",
    "REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
    "\n",
    "# identifier field\n",
    "STANDARD_IDENTIFIER_FIELD = \"id\"\n",
    "\n",
    "# new embedding field name\n",
    "NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GENERIC METHOD TO EXTRACT EMBEDDING COLUMNS FROM A FILE AND CREATE A NEW EMBEDDINGS SPECIFIC FILE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_text_embedding_from_table(\n",
    "    input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str\n",
    ") -> None:\n",
    "    \"\"\"Split one embedding column of a parquet table out into its own parquet file.\n",
    "\n",
    "    Reads the table at ``input_path`` and writes a two-column table\n",
    "    (``STANDARD_IDENTIFIER_FIELD`` plus the embedding column renamed to\n",
    "    ``NEW_STANDARD_EMBEDDING_FIELD``) to ``embeddings_parquet_output_field``.\n",
    "    When ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is set, the source\n",
    "    file is then rewritten in place without the embedding column.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    input_path\n",
    "        Path of the parquet file that currently holds the embedding column.\n",
    "    original_embedding_field\n",
    "        Name of the embedding column to extract.\n",
    "    embeddings_parquet_output_field\n",
    "        Path of the parquet file to create. Despite the parameter name this is\n",
    "        a file path, not a column name.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If the identifier column or the embedding column is not in the table.\n",
    "    \"\"\"\n",
    "    original_df = pd.read_parquet(input_path)\n",
    "\n",
    "    # Validate up front so a bad table fails before any file is written.\n",
    "    required_columns = [STANDARD_IDENTIFIER_FIELD, original_embedding_field]\n",
    "    missing_columns = [\n",
    "        name for name in required_columns if name not in original_df.columns\n",
    "    ]\n",
    "    if missing_columns:\n",
    "        msg = f\"{input_path} is missing required column(s): {missing_columns}\"\n",
    "        raise KeyError(msg)\n",
    "\n",
    "    embeddings_df = original_df[required_columns].rename(\n",
    "        columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}\n",
    "    )  # type: ignore\n",
    "    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
    "\n",
    "    # Build the stripped copy only when it is actually written back.\n",
    "    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE:\n",
    "        original_df.drop(columns=[original_embedding_field]).to_parquet(\n",
    "            input_path, index=False\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ITERATE OVER ALL PARQUET FILES INSIDE THE FOLDER AND EXTRACT EVERY EMBEDDING COLUMN FOUND IN THEM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n",
    "folder_path = Path(LOCAL_ROOT)\n",
    "embedding_column_pattern = re.compile(r\"^(.*?)(_embedding)$\")\n",
    "\n",
    "# Snapshot the listing before doing any work: the loop below creates new\n",
    "# parquet files in this same folder, and whether a directory iterator sees\n",
    "# entries added after it was created is unspecified.\n",
    "parquet_files = sorted(\n",
    "    path\n",
    "    for path in folder_path.iterdir()\n",
    "    if path.is_file() and path.suffix == \".parquet\"\n",
    ")\n",
    "\n",
    "for file_path in parquet_files:\n",
    "    table_columns = pd.read_parquet(str(file_path)).columns.tolist()\n",
    "\n",
    "    for column in table_columns:\n",
    "        # Non-string column labels cannot be embedding columns; skip them\n",
    "        # instead of letting the regex raise TypeError.\n",
    "        if not isinstance(column, str) or not embedding_column_pattern.match(column):\n",
    "            continue\n",
    "\n",
    "        print(f\"Reading {file_path}\")\n",
    "        # <folder>/<table>.parquet + \"<x>_embedding\"\n",
    "        #   -> <folder>/<table>_<x>_embeddings.parquet\n",
    "        filename_without_extension = file_path.with_suffix(\"\").as_posix()\n",
    "        embedding_file_name = (\n",
    "            f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n",
    "        )\n",
    "        extract_text_embedding_from_table(\n",
    "            str(file_path), column, embedding_file_name\n",
    "        )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}