Skip to content

Commit

Permalink
Merge pull request #11088 from keymanapp/feat/common/models/templates…
Browse files Browse the repository at this point in the history
…/trie-compression-start

feat(common/models): add Trie string-encoding + decoding methods 💾
  • Loading branch information
jahorton authored Aug 28, 2024
2 parents 745172c + 2e92a7f commit 2093ef9
Show file tree
Hide file tree
Showing 6 changed files with 540 additions and 14 deletions.
5 changes: 5 additions & 0 deletions common/models/templates/src/trie-builder.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { SENTINEL_CODE_UNIT, Wordform2Key } from "./common.js";
import { compressNode } from "./trie-compression.js";
import { Entry, InternalNode, Leaf, Node, Trie, sortNode } from "./trie.js";

export function createRootNode(): Node {
Expand Down Expand Up @@ -143,4 +144,8 @@ export class TrieBuilder extends Trie {
/**
 * Returns the accumulated total weight for this Trie.
 * (Presumably the sum of all entry weights added so far — confirm against
 * the builder's insertion logic, which is outside this view.)
 */
getTotalWeight(): number {
return this.totalWeight;
}

/**
 * Serializes this Trie, starting from its root node, into the compact
 * string encoding produced by `compressNode` (see trie-compression.ts).
 * @returns the full compressed-string form of the trie.
 */
compress(): string {
return compressNode(this.root);
}
}
273 changes: 273 additions & 0 deletions common/models/templates/src/trie-compression.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
import { Wordform2Key } from "./common.js";
import { Entry, InternalNode, Leaf, Node } from "./trie.js";

// const SINGLE_CHAR_RANGE = Math.pow(2, 16) - 64;
// const ENCODED_NUM_BASE = 64;

// Offsetting by even just 0x0020 avoids control-code chars + avoids VS Code not liking the encoding.
export const ENCODED_NUM_BASE = 0x0020;
export const SINGLE_CHAR_RANGE = Math.pow(2, 16) - ENCODED_NUM_BASE;

/**
 * The number of characters allocated to representing the total length of a node or entry.
 */
const NODE_SIZE_WIDTH = 2;
/**
 * The number of characters allocated to representing the weight (frequency) of entries.
 */
const WEIGHT_WIDTH = 2;

/**
 * Decodes a big-endian, base-SINGLE_CHAR_RANGE number from a span of `str`.
 * Each char contributes (charCode - ENCODED_NUM_BASE) as one "digit".
 * @param str    the encoded string
 * @param start  index of the number's first encoded char
 * @param end    index just past the number's last encoded char; defaults to `str.length`
 * @returns the decoded non-negative integer
 */
export function decompressNumber(str: string, start: number, end?: number): number {
  end ??= str.length;
  let num = 0;

  for(let i = start; i < end; i++) {
    const val = str.charCodeAt(i);
    num = num * SINGLE_CHAR_RANGE + val - ENCODED_NUM_BASE;
  }

  return num;
}

/**
 * Encodes a non-negative integer as a fixed-width, big-endian,
 * base-SINGLE_CHAR_RANGE string (inverse of `decompressNumber`).
 * @param num    the value to encode
 * @param width  the exact number of chars to emit; defaults to 1
 * @throws if `num` does not fit within `width` chars
 */
export function compressNumber(num: number, width?: number): string {
  let compressed = '';
  width ||= 1;

  // Keep the caller's original values: the loop below consumes both `num`
  // and `width`, and the error message must report what was requested.
  const originalNum = num;
  const originalWidth = width;

  // Note: JS bit-shift operators assume 32-bit signed ints.
  // JS numbers can easily represent larger ints, though.
  while(width > 0) {
    const piece = num % SINGLE_CHAR_RANGE + ENCODED_NUM_BASE;
    num = Math.floor(num / SINGLE_CHAR_RANGE);

    compressed = String.fromCharCode(piece) + compressed;

    width--;
  }

  if(num) {
    // Fixed: previously reported the leftover quotient and a decremented
    // (always-zero) width rather than the original arguments.
    throw new Error(`Could not properly compress ${originalNum} within specified char width of ${originalWidth}`);
  }

  return compressed;
}

/**
 * Length of the 'header' section for encoded entries, representing the
 * total size for the entry + its weight.
 */
const ENTRY_HEADER_WIDTH = NODE_SIZE_WIDTH + WEIGHT_WIDTH;

// encoded ENTRY:
// - entryLen: 2 char
//   - total encoded size of the entry, including `entryLen`
//   - as raw, concatenated string data - no JSON.stringify action taken.
// - weight: 2 chars
//   - frequency of the entry
// - contentLen: safe to infer from all other values
// - content: string: from [header+4] to [header+4 + contentLen - 1]
//
// - key: not encoded; we can regenerate it via the keying function

/**
 * Encodes a single lexicon Entry as [entryLen][weight][content].
 * The entry's `key` is intentionally omitted; decompression regenerates
 * it from `content` via the keying function.
 */
export function compressEntry(entry: Entry): string {
  const { content, weight } = entry;

  const weightEnc = compressNumber(weight, WEIGHT_WIDTH);

  // Use NODE_SIZE_WIDTH (not a literal 2) so the header-layout constants
  // stay in sync if the encoding widths ever change.
  const entryLenEnc = compressNumber(content.length + ENTRY_HEADER_WIDTH, NODE_SIZE_WIDTH);

  return `${entryLenEnc}${weightEnc}${content}`;
}

/**
 * Decodes a single compressed Entry starting at `baseIndex` within `str`.
 * @param str          the compressed trie data
 * @param keyFunction  the wordform-to-search-key function used to rebuild `key`
 * @param baseIndex    offset of the entry's first encoded char; defaults to 0
 * @returns the reconstructed Entry
 * @throws if the string ends before the entry's declared length
 */
export function decompressEntry(str: string, keyFunction: Wordform2Key, baseIndex: number = 0): Entry {
  // Default-parameter form replaces the old `baseIndex ||= 0`, making the
  // optionality visible in the signature.
  const entryLen = decompressNumber(str, baseIndex + 0, baseIndex + NODE_SIZE_WIDTH);
  /* c8 ignore start */
  if(str.length < baseIndex + entryLen) {
    throw new Error('Parts of the encoded entry are missing');
  }
  /* c8 ignore end */

  const headerEnd = baseIndex + ENTRY_HEADER_WIDTH;
  const weight = decompressNumber(str, baseIndex + NODE_SIZE_WIDTH, headerEnd);
  const content = str.substring(headerEnd, baseIndex + entryLen);

  return {
    key: keyFunction(content) as any, // needed due to special `SearchKey` type shenanigans in its definition.
    content: content,
    weight: weight
  }
}


// BOTH node types:
// - totalLen: 2 chars (fixed position, size) - size of the encoded node.
// - weight: weight of the highest frequency entry within the node's sub-trie.
//   - 2^32 ~= 4*10^9, representable in 2 chars... if it weren't for
//     `"`-escaping.
//
// Next char: indicates BOTH a count of something (entries or direct children)
// and a high-bit indicating 'leaf' or 'internal'.
// - function of other bits will be indicated by their sections.

/**
 * Length of the 'header' section for encoded leaf and internal nodes, representing the
 * total size for their children/entries + the weight of the highest-frequency entry
 * represented by each node's represented sub-trie.
 */
export const NODE_TYPE_INDEX = NODE_SIZE_WIDTH + WEIGHT_WIDTH;

/**
 * Recursively encodes a Trie node as [size][max weight][type-specific data].
 * Leaf vs internal is distinguished by the high bit of the char at
 * NODE_TYPE_INDEX within the encoding.
 */
export function compressNode(node: Node) {
  const encodedSpecifics = node.type == 'leaf' ? compressLeaf(node) : compressInternal(node);
  const weightEnc = compressNumber(node.weight, WEIGHT_WIDTH);
  // NODE_TYPE_INDEX == NODE_SIZE_WIDTH + WEIGHT_WIDTH: the common header's size.
  const charLength = encodedSpecifics.length + NODE_TYPE_INDEX;

  // NODE_SIZE_WIDTH replaces the previous literal `2` for consistency with
  // the layout constants above.
  return `${compressNumber(charLength, NODE_SIZE_WIDTH)}${weightEnc}${encodedSpecifics}`;
}

/**
 * Decodes the node encoded at `baseIndex` within `str`, dispatching on the
 * leaf/internal type flag (high bit of the char at NODE_TYPE_INDEX).
 * @param str          the compressed trie data
 * @param keyFunction  forwarded to entry decompression for key regeneration
 * @param baseIndex    offset of the node's first encoded char; defaults to 0
 * @throws if the string ends before the node's declared length
 */
export function decompressNode(str: string, keyFunction: Wordform2Key, baseIndex: number = 0) {
  // Default-parameter form replaces the old `baseIndex ||= 0`.
  const entryLen = decompressNumber(str, baseIndex + 0, baseIndex + NODE_SIZE_WIDTH);
  /* c8 ignore start */
  if(str.length < baseIndex + entryLen) {
    throw new Error('Parts of the encoded node are missing');
  }
  /* c8 ignore end */

  const typeFlagSrc = decompressNumber(str, baseIndex + NODE_TYPE_INDEX, baseIndex + NODE_TYPE_INDEX + 1);
  const isLeafType = typeFlagSrc & 0x8000;

  return isLeafType ? decompressLeaf(str, keyFunction, baseIndex) : decompressInternal(str, baseIndex);
}

// encoded LEAF:
// - BOTH-section header
// - entriesCnt: 1 char (fixed position)
//   - type flag overlaps here - high bit of the representing char should be one.
// - entries: full encoding of all contained entries.

/**
 * Encodes a leaf node's type-specific section: a tagged entry count
 * followed by each entry's full encoding.
 */
function compressLeaf(leaf: Leaf): string {
  const entries = leaf.entries;

  // Guard before building the tagged count: the leaf-type flag occupies the
  // high bit, so only counts below 0x8000 are representable at all.
  // NOTE(review): compressNumber's single-char limit is SINGLE_CHAR_RANGE
  // (65504), so tagged counts >= 0x8000 - ENCODED_NUM_BASE would still throw
  // there with a less specific message — confirm whether the bound here
  // should be tightened to match.
  /* c8 ignore start */
  if(entries.length >= 0x8000) {
    throw new Error("Cannot encode leaf: too many direct entries");
  }
  /* c8 ignore end */

  // key, content, weight - per entry
  const entryCntAndType = entries.length | 0x8000;

  const compressedEntries = [compressNumber(entryCntAndType)].concat(entries.map((entry) => {
    // if already compressed, no need to recompress it.
    return typeof entry == 'string' ? entry : compressEntry(entry);
  }));

  return compressedEntries.join('');
}

/**
 * Rebuilds a Leaf node from its encoded form at `baseIndex` within `str`.
 * Assumes the caller (decompressNode) has already verified that the encoded
 * span fits within the string.
 */
function decompressLeaf(str: string, keyFunction: Wordform2Key, baseIndex: number): Leaf {
  const weightStart = baseIndex + NODE_SIZE_WIDTH;
  const weight = decompressNumber(str, weightStart, weightStart + WEIGHT_WIDTH);

  // The count char doubles as the type flag; mask off the high 'leaf' bit
  // to recover the raw entry count.
  const taggedCount = decompressNumber(str, baseIndex + NODE_TYPE_INDEX, baseIndex + NODE_TYPE_INDEX + 1);
  const entryCount = taggedCount & 0x7FFF;

  // Walk the concatenated entries; each one's first NODE_SIZE_WIDTH chars
  // declare its own total width, giving us the offset of the next.
  const entries: Entry[] = [];
  let cursor = baseIndex + NODE_TYPE_INDEX + 1;
  while(entries.length < entryCount) {
    entries.push(decompressEntry(str, keyFunction, cursor));
    cursor += decompressNumber(str, cursor, cursor + NODE_SIZE_WIDTH);
  }

  // To consider: is it better to just make a 'lazy span' against the original string?
  // - would use more memory, especially once a Trie is "mostly" decompressed
  //   - current approach 'discards' decoded parts of the original string.
  // - would likely decompress a bit faster.
  return {
    type: 'leaf',
    weight: weight,
    entries: entries
  };
}

// encoded INTERNAL:
// - BOTH-section header
// - valLen: 1 char (fixed position, size) - we shouldn't ever have an array of > 65000, right?
//   - is also the count for children.
//   - type flag overlaps here - high bit of the representing char should be zero.
// - values: string
//   - ezpz - they're already single-char strings.
// - children: full encoding of next-layer nodes. Same order as the entries are found within `values`.
//   - that is, no need to insert keys.
//   - first two bits: length of following bits... so can use that to calculate offset for next entry's
//     encoding start.

/**
 * Encodes an internal node's type-specific section: the child count, the
 * one-char child keys (in order), then each child's full encoding in the
 * same order.
 */
function compressInternal(node: InternalNode): string {
  const childKeys = node.values;

  /* c8 ignore start */
  if(childKeys.length >= 0x8000) {
    throw new Error("Cannot encode node: too many direct children");
  }
  /* c8 ignore end */

  const encodedChildren = childKeys.map((key) => {
    // In case of regression for #11073
    const lookupKey = key === null ? "undefined" : key; // yes, really.
    const child = node.children[lookupKey];

    /* c8 ignore start */
    if(!child) {
      throw new Error("unexpected empty reference for child");
    }
    /* c8 ignore end */

    // No need to recompress it if it's already compressed.
    return typeof child == 'string' ? child : compressNode(child);
  });

  // Properly fix the sentinel-value issue.
  // Also of note: this array may contain halves of surrogate pairs.
  const encodedKeys = childKeys.map((key) => key === null ? '\ufdd0' : key);

  return [compressNumber(childKeys.length)].concat(encodedKeys).concat(encodedChildren).join('');
}

/**
 * Rebuilds an internal node from its encoded form at `baseIndex` within
 * `str`. Children are NOT decoded here; each is returned as its raw encoded
 * substring, keyed by its single-char value, for lazy decompression later.
 */
function decompressInternal(str: string, baseIndex: number): Omit<InternalNode, 'children'> & { children: {[char: string]: string} } {
  const weight = decompressNumber(str, baseIndex + NODE_SIZE_WIDTH, baseIndex + NODE_SIZE_WIDTH + WEIGHT_WIDTH);

  // Assumes string-subsection size check has passed.
  const childCount = decompressNumber(str, baseIndex + NODE_TYPE_INDEX, baseIndex + NODE_TYPE_INDEX + 1);

  // The child-key chars immediately follow the count char.
  let cursor = baseIndex + NODE_TYPE_INDEX + 1;
  const keys = str.substring(cursor, cursor + childCount).split('');
  cursor += childCount;

  // Each encoded child declares its own width in its first NODE_SIZE_WIDTH
  // chars; slice the children out in the same order as their keys.
  const childMap: {[char: string]: string} = {};
  for(const key of keys) {
    const span = decompressNumber(str, cursor, cursor + NODE_SIZE_WIDTH);
    childMap[key] = str.substring(cursor, cursor + span);
    cursor += span;
  }

  // To consider: is it better to just make a 'lazy span' against the original string?
  // - would use more memory, especially once a Trie is "mostly" decompressed
  // - would likely decompress a bit faster.
  // - would not be wise if we ever wish to have live editing of a loaded Trie; it's only
  //   a useful strategy if the backing data is static.
  return {
    type: 'internal',
    weight: weight,
    values: keys,
    children: childMap
  };
}
21 changes: 10 additions & 11 deletions common/models/templates/src/trie.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ export class TrieTraversal implements LexiconTraversal {
* The current traversal node. Serves as the 'root' of its own sub-Trie,
* and we cannot navigate back to its parent.
*/
readonly root: Node;
root: Node;

/**
* The max weight for the Trie being 'traversed'. Needed for probability
Expand All @@ -108,18 +108,14 @@ export class TrieTraversal implements LexiconTraversal {
}

child(char: USVString): LexiconTraversal | undefined {
/*
Note: would otherwise return the current instance if `char == ''`. If
such a call is happening, it's probably indicative of an implementation
issue elsewhere - let's signal now in order to catch such stuff early.
*/
// May result for blank tokens resulting immediately after whitespace.
if(char == '') {
return undefined;
return this;
}

// Split into individual code units.
let steps = char.split('');
let traversal: ReturnType<TrieTraversal["_child"]> = this;
let traversal: TrieTraversal | undefined = this;

while(steps.length > 0 && traversal) {
const step: string = steps.shift()!;
Expand Down Expand Up @@ -162,6 +158,10 @@ export class TrieTraversal implements LexiconTraversal {

*children(): Generator<{char: USVString, traversal: () => LexiconTraversal}> {
let root = this.root;

// We refer to the field multiple times in this method, and it doesn't change.
// This also assists minification a bit, since we can't minify when re-accessing
// through `this.`.
const totalWeight = this.totalWeight;

// Sorts _just_ the current level, and only if needed.
Expand Down Expand Up @@ -238,11 +238,10 @@ export class TrieTraversal implements LexiconTraversal {
}

get entries() {
const totalWeight = this.totalWeight;
const entryMapper = function(value: Entry) {
const entryMapper = (value: Entry) => {
return {
text: value.content,
p: value.weight / totalWeight
p: value.weight / this.totalWeight
}
}

Expand Down

Large diffs are not rendered by default.

Loading

0 comments on commit 2093ef9

Please sign in to comment.