chore(database): add long db identifiers shortening algorithm

This commit is contained in:
Ben Irvin 2024-02-26 16:22:34 +01:00 committed by GitHub
parent 6111d69ad8
commit c338fa844e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 529 additions and 26 deletions

View File

@ -57,7 +57,6 @@ export const createMetadata = (models: Model[] = []): Metadata => {
createAttribute(attributeName, attribute);
} catch (error) {
console.log(error);
if (error instanceof Error) {
throw new Error(
`Error on attribute ${attributeName} in model ${meta.singularName}(${meta.uid}): ${error.message}`

View File

@ -0,0 +1,296 @@
import {
HASH_LENGTH,
HASH_SEPARATOR,
IDENTIFIER_SEPARATOR,
MIN_TOKEN_LENGTH,
createHash,
getNameFromTokens,
getShortenedName,
} from '../shortener';
describe('identifiers', () => {
describe('constants', () => {
  // These values are baked into every shortened identifier, so a change here changes
  // generated names; the assertions exist to make such a change loud.
  test('HASH_LENGTH === 5', () => expect(HASH_LENGTH).toBe(5));

  test('HASH_SEPARATOR === empty string', () => expect(HASH_SEPARATOR).toBe(''));
});
describe('createHash', () => {
  // Shared fixtures: every case hashes the same input and expects the same validation message
  const input = '123456789';
  const invalidLengthMessage = 'length must be a positive integer';

  test('works with even number length', () => {
    const twoChars = createHash(input, 2);
    expect(twoChars.length).toBe(2);
    expect(twoChars).toEqual('24');

    const fourChars = createHash(input, 4);
    expect(fourChars.length).toBe(4);
    expect(fourChars).toEqual('2434');
  });

  test('works with odd number length', () => {
    const threeChars = createHash(input, 3);
    expect(threeChars.length).toBe(3);
    expect(threeChars).toEqual('243');
  });

  test('works with length longer than input', () => {
    const fiftyChars = createHash(input, 50);
    expect(fiftyChars.length).toBe(50);
    expect(fiftyChars).toEqual('24347b9c4b6da2fc9cde08c87f33edd2e603c8dcd6840e6b39');
  });

  test('throws with len === 0', () => {
    const hashWithZeroLength = () => createHash(input, 0);
    expect(hashWithZeroLength).toThrow(invalidLengthMessage);
  });

  test('throws when len < 0', () => {
    const hashWithNegativeLength = () => createHash(input, -3);
    expect(hashWithNegativeLength).toThrow(invalidLengthMessage);
  });

  test('throws when len invalid data type', () => {
    // @ts-expect-error test bad input type
    const hashWithStringLength = () => createHash(input, '10');
    expect(hashWithStringLength).toThrow(invalidLengthMessage);
  });
});
describe('getShortenedName', () => {
  test('does not add hash when len == input length', () => {
    const res = getShortenedName('1234567890', 10);
    expect(res).toEqual('1234567890');
  });

  test('returns original string when len > input length', () => {
    const res = getShortenedName('1234567890', 100);
    expect(res).toEqual('1234567890');
  });

  test('throws when len < HASH_LENGTH + MIN_TOKEN_LENGTH', () => {
    expect(() => getShortenedName('1234567890', HASH_LENGTH + MIN_TOKEN_LENGTH - 1)).toThrow(
      'length for part of identifier too short, minimum is hash length (5) plus min token length (3), received 7'
    );
  });

  test('adds hash when len < input length (with correct length)', () => {
    const len = 9;
    const res = getShortenedName('1234567890', len);
    expect(res).toEqual('1234cd65a');
    expect(res.length).toBe(len);
  });

  test('adds hash when len == HASH_LENGTH + MIN_TOKEN_LENGTH', () => {
    // Exercise the real lower boundary (8 with the current constants): the smallest length
    // that may still be shortened keeps MIN_TOKEN_LENGTH characters in front of the hash.
    const len = HASH_LENGTH + MIN_TOKEN_LENGTH;
    const res = getShortenedName('1234567890', len);
    expect(res).toEqual('123cd65a');
    expect(res.length).toBe(len);
  });

  test('throws when len === 0', () => {
    expect(() => getShortenedName('1234567890', 0)).toThrow('length must be a positive integer');
  });

  test('throws when len < 0', () => {
    expect(() => getShortenedName('1234567890', -3)).toThrow('length must be a positive integer');
  });

  test('throws when len invalid data type', () => {
    // @ts-expect-error test bad input type
    expect(() => getShortenedName('1234567890', '10')).toThrow(
      'length must be a positive integer'
    );
  });
});
// All getNameFromTokens cases live in this one suite. Expected strings are the stored
// identifiers: a failing assertion here means a breaking (data-losing) change in the
// shortener, so never update an expected value to make a test pass.
describe('getNameFromTokens', () => {
  test('does not shorten strings that fit in min length', () => {
    const name = getNameFromTokens(
      [
        { name: '1234567890', compressible: true },
        { name: '12345', compressible: true },
        { name: 'links', compressible: false },
      ],
      22
    );
    expect(name).toEqual('1234567890_12345_links');
  });

  test('supports strings with separator in them already', () => {
    const name = getNameFromTokens(
      [
        { name: '1234_56789', compressible: true },
        { name: '123_4', compressible: true },
        { name: 'links', compressible: false },
      ],
      22
    );
    expect(name).toEqual('1234_56789_123_4_links');
  });

  test('shortens string that does not fit in min length (one compressible)', () => {
    const name = getNameFromTokens([{ name: '123456789012345', compressible: true }], 13);
    expect(name).toEqual('1234567878db8');
  });

  test('shortens strings with separator in them already (last char before hash)', () => {
    const name = getNameFromTokens([{ name: '1234567_9012345', compressible: true }], 13);
    expect(name).toEqual('1234567_47b4e');
  });

  test('shortens strings with separator in them already (past the hash)', () => {
    const name = getNameFromTokens([{ name: '12345678_012345', compressible: true }], 13);
    expect(name).toEqual('12345678867f6');
  });

  test('returns original string when it fits (one compressible)', () => {
    const name = getNameFromTokens([{ name: '12345', compressible: true }], 5);
    expect(name).toEqual('12345');

    const name2 = getNameFromTokens([{ name: '12345', compressible: true }], 10);
    expect(name2).toEqual('12345');
  });

  test('shortens long string that do not fit in min length (two compressible one of which short, one suffix)', () => {
    const name = getNameFromTokens(
      [
        { name: '1234567890', compressible: true },
        { name: '12345', compressible: true },
        { name: 'links', compressible: false },
      ],
      21
    );
    expect(name).toEqual('1234cd65a_12345_links');
  });

  test('throws when cannot compress without violating min length rules', () => {
    expect(() =>
      getNameFromTokens(
        [
          { name: '1234567890', compressible: true },
          { name: '1234567890', compressible: true },
          { name: 'links', compressible: false },
        ],
        21
      )
    ).toThrow('Maximum length is too small to accommodate all tokens');
  });

  test('shortens two long strings when maxLength is the shortest possible', () => {
    const separatorsNeeded = 2;
    const incompressibleString = 'links';
    const compressibleStrings = 2;
    const len =
      (MIN_TOKEN_LENGTH + HASH_LENGTH) * compressibleStrings +
      incompressibleString.length +
      IDENTIFIER_SEPARATOR.length * separatorsNeeded;
    expect(len).toBe(23);

    const name = getNameFromTokens(
      [
        { name: '1234567890', compressible: true },
        { name: '1234567890', compressible: true },
        { name: incompressibleString, compressible: false },
      ],
      len
    );
    expect(name).toEqual('123cd65a_123cd65a_links');
  });

  test('works with max capacity', () => {
    const res = getNameFromTokens(
      [
        { name: '12', compressible: true },
        { name: '12', compressible: true },
        { name: '12', compressible: true },
        { name: '12', compressible: true },
      ],
      12
    );
    expect(res).toBe('12_12_12_12');
  });

  test('throws when compressible strings cannot fit', () => {
    expect(() =>
      getNameFromTokens(
        [
          { name: '12', compressible: true },
          { name: '12', compressible: true },
          { name: '12', compressible: true },
          { name: '1', compressible: true },
          { name: '12', compressible: true },
        ],
        12
      )
    ).toThrow('Maximum length is too small to accommodate all tokens');
  });

  test('throws when incompressible string cannot fit', () => {
    expect(() => getNameFromTokens([{ name: '123456', compressible: false }], 5)).toThrow(
      'Maximum length is too small to accommodate all tokens'
    );
  });

  test('throws when incompressible strings cannot fit due to separators', () => {
    expect(() =>
      getNameFromTokens(
        [
          { name: '123456', compressible: false },
          { name: '123456', compressible: false },
        ],
        12
      )
    ).toThrow('Maximum length is too small to accommodate all tokens');
  });

  test('shortens strings that result in exactly maxLength (three compressible, suffix)', () => {
    const name = getNameFromTokens(
      [
        { name: '1234567890', compressible: true },
        { name: '12345', compressible: true },
        { name: '0987654321', compressible: true },
        { name: 'links', compressible: false },
      ],
      30
    );
    expect(name.length).toEqual(30);
    expect(name).toEqual('1234cd65a_12345_0984addb_links');
  });

  test('shortens strings that do not fit in min length (three compressible, prefix)', () => {
    const name = getNameFromTokens(
      [
        { name: 'inv_order', compressible: false },
        { name: '1234567890', compressible: true },
        { name: '12345', compressible: true },
        { name: '0987654321', compressible: true },
      ],
      34
    );
    expect(name.length).toEqual(34);
    expect(name).toEqual('inv_order_1234cd65a_12345_0984addb');
  });

  test('shortens strings that do not fit in min length (three compressible, suffix, prefix, and infix)', () => {
    const name = getNameFromTokens(
      [
        { name: 'pre', compressible: false },
        { name: '1234567890', compressible: true },
        { name: 'in', compressible: false },
        { name: '3456789012', compressible: true },
        { name: 'post', compressible: false },
      ],
      31
    );
    expect(name.length).toEqual(31);
    expect(name).toEqual('pre_1234cd65a_in_3456be378_post');
  });

  test('redistributes perfectly to max length even with same length long strings where one must be shortened (three compressible, suffix, prefix, and infix)', () => {
    const name = getNameFromTokens(
      [
        { name: 'pre', compressible: false },
        { name: '1234567890', compressible: true },
        { name: 'in', compressible: false },
        { name: '3456789012', compressible: true },
        { name: 'post', compressible: false },
      ],
      32
    );
    expect(name.length).toEqual(32);
    expect(name).toEqual('pre_1234567890_in_3456be378_post');
  });

  test('works for max length incompressibles', () => {
    const name = getNameFromTokens(
      [
        { name: '1234567890', compressible: false },
        { name: '2345678901', compressible: false },
        { name: '3456789012', compressible: false },
      ],
      34
    );
    expect(name).toEqual('1234567890_2345678901_3456789012');
  });
});
});

View File

@ -1,18 +1,12 @@
import _ from 'lodash/fp';
import { getNameFromTokens } from './shortener';
// TODO: Names will not be shortened until this is set to a non-zero number
export const MAX_DB_IDENTIFIER_LENGTH = 0;
// Constants for column names used in naming methods
export const ENTITY = 'entity';
export const ID_COLUMN = 'id';
export const ORDER_COLUMN = 'order';
export const FIELD_COLUMN = 'field';
// One segment of a generated identifier.
type NameToken = {
// Text of the segment; getNameFromTokens snake_cases it before joining
name: string;
// Flag intended for the shortening step (see the TODO in getNameFromTokens); nothing in this hunk reads it yet
compressible: boolean;
};
// Accepted by getName: a single name or a list of names (normalized with _.castArray)
type NameInput = string | string[];
type NameOptions = {
@ -21,23 +15,13 @@ type NameOptions = {
maxLength?: number;
};
/**
 * Joins the snake_cased token names with underscores.
 *
 * Shortening is not implemented in this version: the joined name is returned unchanged
 * even when it is longer than `max`.
 *
 * @param tokens - Name segments, in output order
 * @param max - Maximum identifier length; a falsy value (e.g. 0) disables the limit
 * @returns The underscore-joined, snake_cased name
 */
export const getNameFromTokens = (tokens: NameToken[], max: number = MAX_DB_IDENTIFIER_LENGTH) => {
  const snakeCasedParts = tokens.map(({ name }) => _.snakeCase(name));
  const joinedName = snakeCasedParts.join('_');

  const limitDisabled = !max;
  if (limitDisabled || joinedName.length <= max) {
    return joinedName;
  }

  // TODO: this is where the shortening algorithm goes
  return joinedName;
};
// Generic name handler used by all helper functions
// Generic name handler that must be used by all helper functions
/**
* TODO: we should be requiring snake_case inputs for all names here, but we
* aren't and it will require some refactoring to make it work. Currently if
* we get names 'myModel' and 'my_model' they would be converted to the same
* final string my_model which generally works but is not entirely safe
* */
export const getName = (names: NameInput, options: NameOptions = {}) => {
const tokens = _.castArray(names).map((name) => {
return {

View File

@ -0,0 +1,224 @@
/**
* @fileoverview This file contains utility functions for shortening identifiers for use in a database schema.
* The functions in this file are used to generate shorter names for database tables and columns
* to avoid breaking the constraints of databases.
*
* IMPORTANT
* Any changes here that result in a different output string from any of the naming methods will
* cause the schema creation to delete data it doesn't recognize because the name
* is different.
*
* If there are any test failures after updating this code, it means there is a breaking change that
* will cause data loss, so beware; do not update the test to match your changes
*
* @internal
*/
import crypto from 'node:crypto';
import { partition, isInteger, sumBy, snakeCase } from 'lodash/fp';
// TODO: Names will not be shortened until this is set to a non-zero number (most likely 55)
// A value of 0 means "unlimited": getNameFromTokens returns the full-length name.
export const MAX_DB_IDENTIFIER_LENGTH = 0;
// Capacity rule of thumb: a compressible token that has to be shortened needs at least
// HASH_LENGTH + HASH_SEPARATOR.length + MIN_TOKEN_LENGTH characters, and n tokens are joined
// by n - 1 separators, so n shortened tokens fit only when
//   n * (HASH_LENGTH + HASH_SEPARATOR.length + MIN_TOKEN_LENGTH) + (n - 1) * IDENTIFIER_SEPARATOR.length <= max length
// Be aware of that when considering changing these values; we should be able to support at
// least 4 compressible identifiers.
// Number of hex characters of hash appended to a shortened token
export const HASH_LENGTH = 5;
export const HASH_SEPARATOR = ''; // no separator is needed, we will just attach hash directly to shortened name
// String placed between tokens when they are joined into the final name
export const IDENTIFIER_SEPARATOR = '_';
// Minimum number of characters of the original token kept in front of the hash
export const MIN_TOKEN_LENGTH = 3;
// One segment of a generated identifier.
type NameToken = {
// Number of characters granted to this token by getNameFromTokens while shortening; unset on input
allocatedLength?: number;
// Text of the segment
name: string;
// Whether getNameFromTokens may shorten this segment; incompressible segments are kept verbatim
compressible: boolean;
};
// A token that has already been given its share of the available length
type NameTokenWithAllocation = NameToken & { allocatedLength: number };
/**
 * Produces a deterministic hex digest of `data`, cut to exactly `len` characters.
 *
 * @example
 * createHash('123456789', 2); // "24"
 * createHash('123456789', 3); // "243"
 * createHash('123456789', 4); // "2434"
 *
 * @param data - The string to hash
 * @param len - The number of hex characters to return
 * @returns The first `len` hex characters of the shake256 digest of `data`
 * @throws Error if `len` is not a positive integer
 * @internal
 */
export function createHash(data: string, len: number): string {
  const isPositiveInteger = isInteger(len) && len > 0;
  if (!isPositiveInteger) {
    throw new Error(`createHash length must be a positive integer, received ${len}`);
  }

  // One digest byte renders as two hex characters, so round up to cover an odd `len`;
  // the extra character is trimmed off below.
  const digestBytes = Math.ceil(len / 2);

  // TODO: shake256 is based on SHA-3 and is slow, we don't care about cryptographically secure, only uniqueness and speed
  // investigate alternatives before releasing this. But it is only run on startup, so it should be fine.
  const hexDigest = crypto
    .createHash('shake256', { outputLength: digestBytes })
    .update(data)
    .digest('hex');

  return hexDigest.slice(0, len);
}
/**
 * Fits `name` into at most `len` characters. A name that already fits is returned unchanged;
 * a longer name is cut and a hash of the full name is appended, so that two long names
 * sharing a prefix do not collapse into the same identifier.
 *
 * @example
 * // with HASH_LENGTH = 5 and an empty HASH_SEPARATOR
 * getShortenedName('1234567890', 10); // "1234567890" (fits, no hash added)
 * getShortenedName('1234567890', 9); // "1234cd65a" (4 characters of the name + 5 of hash)
 *
 * @param name - The base name
 * @param len - The maximum length of the returned string
 * @returns `name` when it fits, otherwise a string of exactly `len` characters made of a
 * prefix of `name`, HASH_SEPARATOR and HASH_LENGTH characters of hash
 * @throws Error if `len` is not a positive integer, or if `name` must be shortened and `len`
 * leaves fewer than MIN_TOKEN_LENGTH characters of the name in front of the hash
 * @internal
 */
export function getShortenedName(name: string, len: number): string {
  if (!isInteger(len) || len <= 0) {
    throw new Error(`getShortenedName length must be a positive integer, received ${len}`);
  }

  if (name.length <= len) {
    return name;
  }

  // Characters of the original name that remain once the hash (and its separator) are
  // accounted for; a single guard covers both the hash and the separator length.
  const availableLength = len - HASH_LENGTH - HASH_SEPARATOR.length;
  if (availableLength < MIN_TOKEN_LENGTH) {
    throw new Error(
      `length for part of identifier too short, minimum is hash length (${HASH_LENGTH}) plus min token length (${MIN_TOKEN_LENGTH}), received ${len} for token ${name}`
    );
  }

  // The hash is computed from the full name, not from the kept prefix
  return `${name.substring(0, availableLength)}${HASH_SEPARATOR}${createHash(name, HASH_LENGTH)}`;
}
/**
 * Builds a name from an array of name tokens joined by IDENTIFIER_SEPARATOR, shortening
 * tokens marked as compressible when the full name is longer than `maxLength`.
 *
 * Incompressible tokens are always kept verbatim. The length left over after them and the
 * separators is split evenly between the compressible tokens; whatever short tokens do not
 * use is handed, one character at a time, to the tokens that are still cut. Tokens that end
 * up with fewer characters than their name are shortened with getShortenedName.
 *
 * NOTE: the "does it fit" check is made on the snake_cased names, while the length accounting
 * and the shortened output use `token.name` as given. Pass names that are already snake_case
 * to get consistent results.
 *
 * The input tokens are not modified.
 *
 * @param nameTokens - Array of name tokens, in output order
 * @param maxLength - Maximum length for the final name string; 0 disables the limit
 * @returns The generated name string, no longer than maxLength (unless maxLength is 0)
 * @throws Error if maxLength is not a non-negative integer, or if the name cannot be
 * shortened to meet maxLength
 * @internal
 */
export function getNameFromTokens(
  nameTokens: NameToken[],
  maxLength = MAX_DB_IDENTIFIER_LENGTH
): string {
  if (!isInteger(maxLength) || maxLength < 0) {
    throw new Error('maxLength must be a positive integer or 0 (for unlimited length)');
  }

  const fullLengthName = nameTokens
    .map((token) => snakeCase(token.name))
    .join(IDENTIFIER_SEPARATOR);

  // if it fits, or maxLength is disabled, return full length string
  if (fullLengthName.length <= maxLength || maxLength === 0) {
    return fullLengthName;
  }

  // Work on shallow copies so the allocation bookkeeping never leaks onto the caller's objects
  const tokens: NameToken[] = nameTokens.map((token) => ({ ...token }));

  // Split tokens by compressibility
  const [compressible, incompressible] = partition(
    (token: NameToken) => token.compressible,
    tokens
  );

  const totalIncompressibleLength = sumBy((token: NameToken) => token.name.length)(incompressible);
  // n tokens are joined by n - 1 separators (tokens is never empty here: an empty list
  // produces an empty name, which always fits and has already been returned)
  const totalSeparatorsLength = (tokens.length - 1) * IDENTIFIER_SEPARATOR.length;
  const available = maxLength - totalIncompressibleLength - totalSeparatorsLength;

  // With nothing to compress there is no per-token share to compute (and no division by zero)
  const hasCompressible = compressible.length > 0;
  const availablePerToken = hasCompressible ? Math.floor(available / compressible.length) : 0;

  if (available < 0 || (hasCompressible && availablePerToken < MIN_TOKEN_LENGTH)) {
    throw new Error('Maximum length is too small to accommodate all tokens');
  }

  // Start the surplus with the remainder of the even split
  let surplus = hasCompressible ? available % compressible.length : 0;

  // Check that it's even possible to proceed: a compressible token never needs more than its
  // own length, nor more than the smallest hashed form
  const minHashedLength = HASH_LENGTH + HASH_SEPARATOR.length + MIN_TOKEN_LENGTH;
  const minCompressibleLength = sumBy((token: NameToken) =>
    Math.min(token.name.length, minHashedLength)
  )(compressible);
  const minTotalLength = minCompressibleLength + totalIncompressibleLength + totalSeparatorsLength;

  if (maxLength < minTotalLength) {
    throw new Error('Maximum length is too small to accommodate all tokens');
  }

  // Give every compressible token its share; short tokens return what they do not need to the
  // surplus, tokens that use their whole share are candidates for receiving it
  let deficits: NameTokenWithAllocation[] = [];
  compressible.forEach((token) => {
    const actualLength = token.name.length;
    if (actualLength < availablePerToken) {
      surplus += availablePerToken - actualLength;
      token.allocatedLength = actualLength;
    } else {
      deficits.push(Object.assign(token, { allocatedLength: availablePerToken }));
    }
  });

  // Redistribute surplus length to longer strings, one character at a time
  // This way we avoid issues with greed and trying to handle floating points by dividing available length
  while (surplus > 0 && deficits.length > 0) {
    const surplusBeforeRound = surplus;

    deficits = deficits.filter((token) => {
      if (token.allocatedLength < token.name.length && surplus > 0) {
        token.allocatedLength += 1;
        surplus -= 1;
        // if it hasn't reached its full length, keep it in array for next round
        return token.allocatedLength < token.name.length;
      }
      return false; // nothing more to give to (or to give): drop it from the candidates
    });

    // infinite loop protection; if the surplus hasn't changed, there was nothing left to distribute it to
    if (surplus === surplusBeforeRound) {
      break;
    }
  }

  // Build final string
  const shortenedName = tokens
    .map((token) =>
      token.compressible && token.allocatedLength !== undefined
        ? getShortenedName(token.name, token.allocatedLength)
        : token.name
    )
    .join(IDENTIFIER_SEPARATOR);

  // this should be unreachable, but add a final check for potential edge cases we missed
  if (shortenedName.length > maxLength) {
    throw new Error(
      `name shortening failed to generate a name of the correct maxLength; name ${shortenedName}`
    );
  }

  return shortenedName;
}