chore(database): add long db identifiers shortening algorithm

2025-12-25 22:23:10 +00:00 · 2024-02-26 16:22:34 +01:00 · 2024-02-26 16:22:34 +01:00 · c338fa844e
commit c338fa844e
parent 6111d69ad8
4 changed files with 529 additions and 26 deletions
--- a/packages/core/database/src/metadata/index.ts
+++ b/packages/core/database/src/metadata/index.ts
@ -57,7 +57,6 @@ export const createMetadata = (models: Model[] = []): Metadata => {

        createAttribute(attributeName, attribute);
      } catch (error) {
-        console.log(error);
        if (error instanceof Error) {
          throw new Error(
            `Error on attribute ${attributeName} in model ${meta.singularName}(${meta.uid}): ${error.message}`
--- a/packages/core/database/src/utils/identifiers/tests/identifiers.test.ts
+++ b/packages/core/database/src/utils/identifiers/tests/identifiers.test.ts
@ -0,0 +1,296 @@
+import {
+  HASH_LENGTH,
+  HASH_SEPARATOR,
+  IDENTIFIER_SEPARATOR,
+  MIN_TOKEN_LENGTH,
+  createHash,
+  getNameFromTokens,
+  getShortenedName,
+} from '../shortener';
+
+describe('identifiers', () => {
+  describe('constants', () => {
+    // These values are baked into every shortened name, so pin them explicitly
+    test('HASH_LENGTH === 5', () => {
+      const expectedHashLength = 5;
+      expect(HASH_LENGTH).toStrictEqual(expectedHashLength);
+    });
+
+    test('HASH_SEPARATOR === empty string', () => {
+      const expectedSeparator = '';
+      expect(HASH_SEPARATOR).toStrictEqual(expectedSeparator);
+    });
+  });
+
+  describe('createHash', () => {
+    // Every case hashes the same input, so shorter expectations are prefixes of the longer ones
+    const input = '123456789';
+    const lengthError = 'length must be a positive integer';
+
+    test('works with even number length', () => {
+      const twoChars = createHash(input, 2);
+      expect(twoChars).toHaveLength(2);
+      expect(twoChars).toBe('24');
+
+      const fourChars = createHash(input, 4);
+      expect(fourChars).toHaveLength(4);
+      expect(fourChars).toBe('2434');
+    });
+
+    test('works with odd number length', () => {
+      const threeChars = createHash(input, 3);
+      expect(threeChars).toHaveLength(3);
+      expect(threeChars).toBe('243');
+    });
+
+    test('works with length longer than input', () => {
+      const fiftyChars = createHash(input, 50);
+      expect(fiftyChars).toHaveLength(50);
+      expect(fiftyChars).toBe('24347b9c4b6da2fc9cde08c87f33edd2e603c8dcd6840e6b39');
+    });
+
+    test('throws with len === 0', () => {
+      const hashWithZeroLength = () => createHash(input, 0);
+      expect(hashWithZeroLength).toThrow(lengthError);
+    });
+
+    test('throws when len < 0', () => {
+      const hashWithNegativeLength = () => createHash(input, -3);
+      expect(hashWithNegativeLength).toThrow(lengthError);
+    });
+
+    test('throws when len invalid data type', () => {
+      // @ts-expect-error test bad input type
+      const hashWithStringLength = () => createHash(input, '10');
+      expect(hashWithStringLength).toThrow(lengthError);
+    });
+  });
+
+  // Covers getShortenedName: names that fit are returned untouched, longer names are truncated and
+  // suffixed with a HASH_LENGTH character hash
+  describe('getShortenedName', () => {
+    test('does not add hash when len == input length', () => {
+      const res = getShortenedName('1234567890', 10);
+      expect(res).toEqual('1234567890');
+    });
+    test('returns original string when len > input length', () => {
+      const res = getShortenedName('1234567890', 100);
+      expect(res).toEqual('1234567890');
+    });
+    test('throws when len < HASH_LENGTH + MIN_TOKEN_LENGTH', () => {
+      expect(() => getShortenedName('1234567890', HASH_LENGTH + MIN_TOKEN_LENGTH - 1)).toThrow(
+        'length for part of identifier too short, minimum is hash length (5) plus min token length (3), received 7'
+      );
+    });
+    test('adds hash when len < input length (with correct length)', () => {
+      const len = 9;
+      const res = getShortenedName('1234567890', len);
+      expect(res).toEqual('1234cd65a');
+      expect(res.length).toBe(len);
+    });
+    test('adds hash when len == HASH_LENGTH + MIN_TOKEN_LENGTH', () => {
+      // Exercise the smallest length that still allows shortening: MIN_TOKEN_LENGTH characters of the
+      // name followed by the hash
+      const len = HASH_LENGTH + MIN_TOKEN_LENGTH;
+      const res = getShortenedName('1234567890', len);
+      expect(res).toEqual('123cd65a');
+      expect(res.length).toBe(len);
+    });
+    test('throws when len === 0', () => {
+      expect(() => getShortenedName('1234567890', 0)).toThrow('length must be a positive integer');
+    });
+    test('throws when len < 0', () => {
+      expect(() => getShortenedName('1234567890', -3)).toThrow('length must be a positive integer');
+    });
+    test('throws when len invalid data type', () => {
+      // @ts-expect-error test bad input type
+      expect(() => getShortenedName('1234567890', '10')).toThrow(
+        'length must be a positive integer'
+      );
+    });
+  });
+  // Expected strings in this suite are fixed outputs of the shortening algorithm; per the warning at
+  // the top of shortener.ts they must not be edited to make a failing change pass
+  describe('getNameFromTokens', () => {
+    test('does not shorten strings that fit in min length', () => {
+      const name = getNameFromTokens(
+        [
+          { name: '1234567890', compressible: true },
+          { name: '12345', compressible: true },
+          { name: 'links', compressible: false },
+        ],
+        22
+      );
+      expect(name).toEqual('1234567890_12345_links');
+    });
+
+    test('supports strings with separator in them already', () => {
+      const name = getNameFromTokens(
+        [
+          { name: '1234_56789', compressible: true },
+          { name: '123_4', compressible: true },
+          { name: 'links', compressible: false },
+        ],
+        22
+      );
+      expect(name).toEqual('1234_56789_123_4_links');
+    });
+
+    test('shortens string that does not fit in min length (one compressible)', () => {
+      const name = getNameFromTokens([{ name: '123456789012345', compressible: true }], 13);
+      expect(name).toEqual('1234567878db8');
+    });
+
+    test('shortens strings with separator in them already (last char before hash)', () => {
+      const name = getNameFromTokens([{ name: '1234567_9012345', compressible: true }], 13);
+      expect(name).toEqual('1234567_47b4e');
+    });
+
+    test('shortens strings with separator in them already (past the hash)', () => {
+      const name = getNameFromTokens([{ name: '12345678_012345', compressible: true }], 13);
+      expect(name).toEqual('12345678867f6');
+    });
+
+    test('returns original string when it fits (one compressible)', () => {
+      const name = getNameFromTokens([{ name: '12345', compressible: true }], 5);
+      expect(name).toEqual('12345');
+
+      const name2 = getNameFromTokens([{ name: '12345', compressible: true }], 10);
+      expect(name2).toEqual('12345');
+    });
+
+    test('shortens long string that do not fit in min length (two compressible one of which short, one suffix)', () => {
+      const name = getNameFromTokens(
+        [
+          { name: '1234567890', compressible: true },
+          { name: '12345', compressible: true },
+          { name: 'links', compressible: false },
+        ],
+        21
+      );
+      expect(name).toEqual('1234cd65a_12345_links');
+    });
+
+    test('throws when cannot compress without violating min length rules', () => {
+      expect(() =>
+        getNameFromTokens(
+          [
+            { name: '1234567890', compressible: true },
+            { name: '1234567890', compressible: true },
+            { name: 'links', compressible: false },
+          ],
+          21
+        )
+      ).toThrow('Maximum length is too small to accommodate all tokens');
+    });
+
+    test('shortens two long strings when maxLength is the shortest possible', () => {
+      const separatorsNeeded = 2;
+      const incompressibleString = 'links';
+      const compressibleStrings = 2;
+      const len =
+        (MIN_TOKEN_LENGTH + HASH_LENGTH) * compressibleStrings +
+        incompressibleString.length +
+        IDENTIFIER_SEPARATOR.length * separatorsNeeded;
+      expect(len).toBe(23);
+
+      const name = getNameFromTokens(
+        [
+          { name: '1234567890', compressible: true },
+          { name: '1234567890', compressible: true },
+          { name: incompressibleString, compressible: false },
+        ],
+        len
+      );
+      expect(name).toEqual('123cd65a_123cd65a_links');
+    });
+
+    test('works with max capacity', () => {
+      const res = getNameFromTokens(
+        [
+          { name: '12', compressible: true },
+          { name: '12', compressible: true },
+          { name: '12', compressible: true },
+          { name: '12', compressible: true },
+        ],
+        12
+      );
+      expect(res).toBe('12_12_12_12');
+    });
+
+    test('throws when compressible strings cannot fit', () => {
+      expect(() =>
+        getNameFromTokens(
+          [
+            { name: '12', compressible: true },
+            { name: '12', compressible: true },
+            { name: '12', compressible: true },
+            { name: '1', compressible: true },
+            { name: '12', compressible: true },
+          ],
+          12
+        )
+      ).toThrow('Maximum length is too small to accommodate all tokens');
+    });
+
+    test('throws when incompressible string cannot fit', () => {
+      expect(() => getNameFromTokens([{ name: '123456', compressible: false }], 5)).toThrow(
+        'Maximum length is too small to accommodate all tokens'
+      );
+    });
+
+    test('throws when incompressible strings cannot fit due to separators', () => {
+      expect(() =>
+        getNameFromTokens(
+          [
+            { name: '123456', compressible: false },
+            { name: '123456', compressible: false },
+          ],
+          12
+        )
+      ).toThrow('Maximum length is too small to accommodate all tokens');
+    });
+
+    test('shortens strings that result in exactly maxLength (three compressible, suffix)', () => {
+      const name = getNameFromTokens(
+        [
+          { name: '1234567890', compressible: true },
+          { name: '12345', compressible: true },
+          { name: '0987654321', compressible: true },
+          { name: 'links', compressible: false },
+        ],
+        30
+      );
+      expect(name.length).toEqual(30);
+      expect(name).toEqual('1234cd65a_12345_0984addb_links');
+    });
+
+    test('shortens strings that do not fit in min length (three compressible, prefix)', () => {
+      const name = getNameFromTokens(
+        [
+          { name: 'inv_order', compressible: false },
+          { name: '1234567890', compressible: true },
+          { name: '12345', compressible: true },
+          { name: '0987654321', compressible: true },
+        ],
+        34
+      );
+      expect(name.length).toEqual(34);
+      expect(name).toEqual('inv_order_1234cd65a_12345_0984addb');
+    });
+
+    test('shortens strings that do not fit in min length (three compressible, suffix, prefix, and infix)', () => {
+      const name = getNameFromTokens(
+        [
+          { name: 'pre', compressible: false },
+          { name: '1234567890', compressible: true },
+          { name: 'in', compressible: false },
+          { name: '3456789012', compressible: true },
+          { name: 'post', compressible: false },
+        ],
+        31
+      );
+      expect(name.length).toEqual(31);
+      expect(name).toEqual('pre_1234cd65a_in_3456be378_post');
+    });
+
+    test('redistributes perfectly to max length even with same length long strings where one must be shortened (three compressible, suffix, prefix, and infix)', () => {
+      const name = getNameFromTokens(
+        [
+          { name: 'pre', compressible: false },
+          { name: '1234567890', compressible: true },
+          { name: 'in', compressible: false },
+          { name: '3456789012', compressible: true },
+          { name: 'post', compressible: false },
+        ],
+        32
+      );
+      expect(name.length).toEqual(32);
+      expect(name).toEqual('pre_1234567890_in_3456be378_post');
+    });
+
+    test('works for max length incompressibles', () => {
+      const name = getNameFromTokens(
+        [
+          { name: '1234567890', compressible: false },
+          { name: '2345678901', compressible: false },
+          { name: '3456789012', compressible: false },
+        ],
+        34
+      );
+      expect(name).toEqual('1234567890_2345678901_3456789012');
+    });
+  });
+});
--- a/packages/core/database/src/utils/identifiers/index.ts
+++ b/packages/core/database/src/utils/identifiers/index.ts
@ -1,18 +1,12 @@
 import _ from 'lodash/fp';
+import { getNameFromTokens } from './shortener';

-// TODO: Names will not be shortened until this is set to a non-zero number
-export const MAX_DB_IDENTIFIER_LENGTH = 0;
-
+// Constants for column names used in naming methods
 export const ENTITY = 'entity';
 export const ID_COLUMN = 'id';
 export const ORDER_COLUMN = 'order';
 export const FIELD_COLUMN = 'field';

-type NameToken = {
-  name: string;
-  compressible: boolean;
-};
-
 type NameInput = string | string[];

 type NameOptions = {
@ -21,23 +15,13 @@ type NameOptions = {
  maxLength?: number;
 };

-export const getNameFromTokens = (tokens: NameToken[], max: number = MAX_DB_IDENTIFIER_LENGTH) => {
-  const fullLength = tokens
-    .map((token) => {
-      return _.snakeCase(token.name);
-    })
-    .join('_');
-
-  if (!max || fullLength.length <= max) {
-    return fullLength;
-  }
-
-  // TODO: this is where the shortening algorithm goes
-
-  return fullLength;
-};
-
-// Generic name handler used by all helper functions
+// Generic name handler that must be used by all helper functions
+/**
+ * TODO: we should be requiring snake_case inputs for all names here, but we
+ * aren't and it will require some refactoring to make it work. Currently if
+ * we get names 'myModel' and 'my_model' they would be converted to the same
+ * final string my_model which generally works but is not entirely safe
+ * */
 export const getName = (names: NameInput, options: NameOptions = {}) => {
  const tokens = _.castArray(names).map((name) => {
    return {
--- a/packages/core/database/src/utils/identifiers/shortener.ts
+++ b/packages/core/database/src/utils/identifiers/shortener.ts
@ -0,0 +1,224 @@
+/**
+ * @fileoverview This file contains utility functions for shortening identifiers for use in a database schema.
+ * The functions in this file are used to generate shorter names for database tables and columns
+ * to avoid breaking the constraints of databases.
+ *
+ * IMPORTANT
+ * Any changes here that result in a different output string from any of the naming methods will
+ * cause the schema creation to delete data it doesn't recognize because the name
+ * is different.
+ *
+ * If there are any test failures after updating this code, it means there is a breaking change that
+ * will cause data loss, so beware; do not update the test to match your changes
+ *
+ * @internal
+ */
+
+import crypto from 'node:crypto';
+import { partition, isInteger, sumBy, snakeCase } from 'lodash/fp';
+
+// TODO: Names will not be shortened until this is set to a non-zero number (most likely 55)
+// This is the default maxLength of getNameFromTokens, which treats 0 as "unlimited" and returns
+// the full-length name without shortening anything
+export const MAX_DB_IDENTIFIER_LENGTH = 0;
+
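+// Every shortened compressible token needs at least HASH_LENGTH + HASH_SEPARATOR.length + MIN_TOKEN_LENGTH
+// characters and tokens are joined with IDENTIFIER_SEPARATOR, so (ignoring incompressible tokens) the number
+// of compressible tokens we can accept is at most:
+// tokens accepted = floor((MAX_LENGTH + IDENTIFIER_SEPARATOR.length) / (HASH_LENGTH + HASH_SEPARATOR.length + MIN_TOKEN_LENGTH + IDENTIFIER_SEPARATOR.length))
+// Be aware of that when considering changing these values; we should be able to support at least 4 compressible identifiers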
+
+// Number of hash characters appended to a token when getShortenedName has to truncate it
+export const HASH_LENGTH = 5;
+export const HASH_SEPARATOR = ''; // no separator is needed, we will just attach hash directly to shortened name
+// String placed between tokens when they are joined into the final name
+export const IDENTIFIER_SEPARATOR = '_';
+// Fewest characters of the original token that must remain in front of the hash after truncation
+export const MIN_TOKEN_LENGTH = 3;
+
+// One part of an identifier, as consumed by getNameFromTokens
+type NameToken = {
+  // Number of characters budgeted for this token while a name is being shortened
+  allocatedLength?: number;
+  // Text of the token
+  name: string;
+  // Whether the token may be truncated and hashed when the full name does not fit
+  compressible: boolean;
+};
+
+// A token whose character budget has been assigned
+type NameTokenWithAllocation = NameToken & { allocatedLength: number };
+
+/**
+ * Hashes the given data and returns the first `len` hexadecimal characters of the digest.
+ *
+ * The digest is requested with just enough bytes to cover `len` hex characters (two characters per
+ * byte, rounded up) and then trimmed, which is what makes odd lengths possible.
+ *
+ * @example
+ * createHash("123456789", 2); // "24"
+ * createHash("123456789", 3); // "243"
+ * createHash("123456789", 4); // "2434"
+ *
+ * @param data - The data to be hashed
+ * @param len - The number of hex characters to return
+ * @returns The hash, exactly `len` characters long
+ * @throws Error if the length is not a positive integer
+ * @internal
+ */
+export function createHash(data: string, len: number): string {
+  const isPositiveInteger = isInteger(len) && len > 0;
+  if (!isPositiveInteger) {
+    throw new Error(`createHash length must be a positive integer, received ${len}`);
+  }
+
+  // TODO: shake256 is based on SHA-3 and is slow, we don't care about cryptographically secure, only uniqueness and speed
+  //       investigate alternatives before releasing this. But it is only run on startup, so it should be fine.
+  const digestBytes = Math.ceil(len / 2); // each byte yields two hex characters
+  const hexDigest = crypto
+    .createHash('shake256', { outputLength: digestBytes })
+    .update(data)
+    .digest('hex');
+
+  // an odd `len` leaves one surplus hex character, drop it
+  return hexDigest.slice(0, len);
+}
+
+/**
+ * Fits a name into at most `len` characters. A name that already fits is returned unchanged; a longer
+ * name is truncated and a hash of the full name is appended, to keep shortened names distinguishable.
+ *
+ * @example
+ * // if we have strings such as "longstring1" and "longstring2" with a max length of 9,
+ * // we don't want to end up with "longstrin" and "longstrin", so each one keeps
+ * // 4 characters of the name followed by its own 5 character hash
+ * getShortenedName("1234567890", 10); // "1234567890" (fits, returned as-is)
+ * getShortenedName("1234567890", 9); // "1234cd65a"
+ *
+ * @param name - The base name
+ * @param len - The maximum length of the returned name
+ * @returns `name` itself when it fits in `len`, otherwise a string of exactly `len` characters ending in the hash
+ * @throws Error if the length is not a positive integer, or if the name has to be shortened and `len`
+ *   cannot hold the hash plus MIN_TOKEN_LENGTH characters of the name
+ * @internal
+ */
+export function getShortenedName(name: string, len: number) {
+  if (!isInteger(len) || len <= 0) {
+    throw new Error(`getShortenedName length must be a positive integer, received ${len}`);
+  }
+
+  // nothing to do when the name already fits; note this is allowed even for very small `len`
+  if (name.length <= len) {
+    return name;
+  }
+
+  if (len < MIN_TOKEN_LENGTH + HASH_LENGTH) {
+    throw new Error(
+      `length for part of identifier too short, minimum is hash length (${HASH_LENGTH}) plus min token length (${MIN_TOKEN_LENGTH}), received ${len} for token ${name}`
+    );
+  }
+
+  // characters of the original name that remain once the hash (and its separator) are reserved
+  const availableLength = len - HASH_LENGTH - HASH_SEPARATOR.length;
+  // only reachable with a non-empty HASH_SEPARATOR, which the check above does not account for
+  if (availableLength < MIN_TOKEN_LENGTH) {
+    throw new Error(
+      `length for part of identifier minimum is less than min token length (${MIN_TOKEN_LENGTH}), received ${len} for token ${name}`
+    );
+  }
+
+  // the hash is computed from the full name, not from the truncated prefix
+  return `${name.substring(0, availableLength)}${HASH_SEPARATOR}${createHash(name, HASH_LENGTH)}`;
+}
+
+/**
+ * Constructs a name from an array of name tokens, joined with IDENTIFIER_SEPARATOR, within a specified
+ * maximum length.
+ *
+ * When the snake_cased full name fits (or `maxLength` is 0, meaning unlimited) it is returned as-is.
+ * Otherwise the space left after incompressible tokens and separators is split evenly between the
+ * compressible tokens; space that short tokens do not need is handed, one character at a time, to the
+ * tokens that are still too long, and every token that still does not fit is shortened with
+ * getShortenedName. Incompressible tokens are always preserved.
+ *
+ * NOTE: on the shortening path token names are used exactly as given (they are not snake_cased),
+ * unlike the full-length path above.
+ *
+ * The tokens passed in are never modified.
+ *
+ * @param nameTokens - Array of name tokens
+ * @param maxLength - Maximum length for the final name string, 0 for unlimited
+ * @returns The generated name string within maxLength.
+ * @throws Error if maxLength is not a non-negative integer, or if the name cannot be shortened to meet maxLength.
+ * @internal
+ */
+export function getNameFromTokens(nameTokens: NameToken[], maxLength = MAX_DB_IDENTIFIER_LENGTH) {
+  if (!isInteger(maxLength) || maxLength < 0) {
+    throw new Error('maxLength must be a positive integer or 0 (for unlimited length)');
+  }
+
+  const fullLengthName = nameTokens
+    .map((token) => snakeCase(token.name))
+    .join(IDENTIFIER_SEPARATOR);
+
+  // if it fits, or maxLength is disabled, return full length string
+  if (fullLengthName.length <= maxLength || maxLength === 0) {
+    return fullLengthName;
+  }
+
+  // Split tokens by compressibility
+  const [compressible, incompressible] = partition(
+    (token: NameToken) => token.compressible,
+    nameTokens
+  );
+
+  const totalIncompressibleLength = sumBy((token: NameToken) => token.name.length)(incompressible);
+  // one separator between each pair of tokens; nameTokens cannot be empty here because an empty
+  // list produces an empty name, which always fits and was returned above
+  const totalSeparatorsLength = (nameTokens.length - 1) * IDENTIFIER_SEPARATOR.length;
+  const available = maxLength - totalIncompressibleLength - totalSeparatorsLength;
+
+  // guard the divisions: with no compressible tokens there is nothing to share out
+  const hasCompressible = compressible.length > 0;
+  const availablePerToken = hasCompressible ? Math.floor(available / compressible.length) : 0;
+
+  if (available < 0 || (hasCompressible && availablePerToken < MIN_TOKEN_LENGTH)) {
+    throw new Error('Maximum length is too small to accommodate all tokens');
+  }
+
+  // Check that it's even possible to proceed: a compressible token needs either its full length or,
+  // when it has to be shortened, room for the hash plus the minimum number of name characters
+  const minHashedLength = HASH_LENGTH + HASH_SEPARATOR.length + MIN_TOKEN_LENGTH;
+  const minCompressibleLength = sumBy((token: NameToken) =>
+    Math.min(token.name.length, minHashedLength)
+  )(compressible);
+
+  if (available < minCompressibleLength) {
+    throw new Error('Maximum length is too small to accommodate all tokens');
+  }
+
+  // Start the surplus with the remainder of the even split
+  let surplus = hasCompressible ? available % compressible.length : 0;
+
+  // Give every compressible token its share, working on copies so the caller's tokens stay untouched.
+  // `allocations` lines up index by index with nameTokens (undefined for incompressible tokens).
+  const allocations: Array<NameTokenWithAllocation | undefined> = [];
+  let deficits: NameTokenWithAllocation[] = [];
+  nameTokens.forEach((token) => {
+    if (!token.compressible) {
+      allocations.push(undefined);
+      return;
+    }
+
+    const actualLength = token.name.length;
+    if (actualLength < availablePerToken) {
+      // shorter than its share: keep the full name and release the rest
+      surplus += availablePerToken - actualLength;
+      allocations.push({ ...token, allocatedLength: actualLength });
+      return;
+    }
+
+    const allocation: NameTokenWithAllocation = { ...token, allocatedLength: availablePerToken };
+    deficits.push(allocation);
+    allocations.push(allocation);
+  });
+
+  // Redistribute surplus length to longer strings, one character at a time
+  // This way we avoid issues with greed and trying to handle floating points by dividing available length
+  // Each pass either grows or drops every remaining token, so the loop always makes progress
+  while (surplus > 0 && deficits.length > 0) {
+    deficits = deficits.filter((allocation) => {
+      if (surplus <= 0 || allocation.allocatedLength >= allocation.name.length) {
+        return false; // nothing left to give, or the token already fits
+      }
+      allocation.allocatedLength += 1;
+      surplus -= 1;
+      // if it hasn't reached its full length, keep it in array for next round
+      return allocation.allocatedLength < allocation.name.length;
+    });
+  }
+
+  // Build final string
+  const shortenedName = nameTokens
+    .map((token, index) => {
+      const allocation = allocations[index];
+      // incompressible tokens and tokens that fit in their allocation are emitted verbatim
+      if (allocation === undefined || allocation.allocatedLength >= token.name.length) {
+        return token.name;
+      }
+      return getShortenedName(token.name, allocation.allocatedLength);
+    })
+    .join(IDENTIFIER_SEPARATOR);
+
+  // this should be unreachable, but add a final check for potential edge cases we missed
+  if (shortenedName.length > maxLength) {
+    throw new Error(
+      `name shortening failed to generate a name of the correct maxLength; name ${shortenedName}`
+    );
+  }
+
+  return shortenedName;
+}