Skip to content

Commit

Permalink
Merge pull request #10692 from keymanapp/feat/web/wordbreaker-data-op…
Browse files Browse the repository at this point in the history
…timization

feat(web): optimize the wordbreaker data table for filesize and ease of first-load parsing ⚡
  • Loading branch information
jahorton authored Aug 27, 2024
2 parents 3e89ff8 + 4159bab commit 957412a
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 69 deletions.
39 changes: 5 additions & 34 deletions common/models/wordbreakers/src/main/default/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { WordBreakProperty, WORD_BREAK_PROPERTY, I, propertyMap } from "./data.inc.js";
import { WordBreakProperty, propertyMap } from "./data.inc.js";

import { searchForProperty } from "./searchForProperty.js";

/**
* A set of options used to customize and extend the behavior of the default
Expand Down Expand Up @@ -566,43 +568,12 @@ function property(character: string, options?: DefaultWordBreakerOptions): WordB
// TODO: remove dependence on character.codepointAt()?
let codepoint = character.codePointAt(0) as number;

return searchForProperty(codepoint, 0, WORD_BREAK_PROPERTY.length - 1);
return searchForProperty(codepoint);
}

/**
 * Resolves a word-break property name (compared case-insensitively) to its
 * numeric value.
 *
 * Names found in `options.customProperties` take precedence and map to
 * negative values: the custom property at index `i` yields `-i - 1`.
 * Otherwise the result is the name's index within `propertyMap`, or `-1`
 * when the name is not found there either.
 */
function propertyVal(propName: string, options?: DefaultWordBreakerOptions) {
  const sameName = (candidate: string) => candidate.toLowerCase() == propName.toLowerCase();

  const customNames = options?.customProperties;
  const customIndex = customNames == null ? -1 : customNames.findIndex(sameName);

  if (customIndex != -1) {
    // Custom properties live in the negative range so they can never collide
    // with the indices of the built-in properties.
    return -customIndex - 1;
  }

  return propertyMap.findIndex(sameName);
}

/**
* Binary search for the word break property of a given CODE POINT.
*
* The auto-generated data.ts master array defines a **character range**
* lookup table. If a character's codepoint is equal to or greater than
* the I.Start value for an entry and exclusively less than the next entry,
* it falls in the first entry's range bucket and is classified accordingly
* by this method.
*/
function searchForProperty(codePoint: number, left: number, right: number): WordBreakProperty {
// All items that are not found in the array are assigned the 'Other' property.
if (right < left) {
return WordBreakProperty.Other;
}

let midpoint = left + ~~((right - left) / 2);
let candidate = WORD_BREAK_PROPERTY[midpoint];

let nextRange = WORD_BREAK_PROPERTY[midpoint + 1];
let startOfNextRange = nextRange ? nextRange[I.Start] : Infinity;

if (codePoint < candidate[I.Start]) {
return searchForProperty(codePoint, left, midpoint - 1);
} else if (codePoint >= startOfNextRange) {
return searchForProperty(codePoint, midpoint + 1, right);
}

// We found it!
return candidate[I.Value];
}
}
43 changes: 43 additions & 0 deletions common/models/wordbreakers/src/main/default/searchForProperty.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import { WordBreakProperty, WORD_BREAK_PROPERTY_BMP, WORD_BREAK_PROPERTY_NON_BMP } from "./data.inc.js";

/**
 * Amount by which each property value is shifted inside the encoded lookup
 * strings.  It must be subtracted from a stored property code to recover the
 * actual `WordBreakProperty` value.
 */
const PROPERTY_ENCODING_OFFSET = 0x20;

/**
 * Looks up the word-break property assigned to a code point.
 *
 * @param codePoint  The Unicode code point to classify.
 * @returns The property of the range containing `codePoint`, or
 *          `WordBreakProperty.Other` if no encoded range contains it.
 */
export function searchForProperty(codePoint: number): WordBreakProperty {
  // Each entry is "<range start><property code>".  A BMP range start fits in
  // one UTF-16 code unit; a non-BMP start needs a surrogate pair, making
  // those entries one code unit longer.
  const bucketSize = codePoint <= 0xFFFF ? 2 : 3;
  const encodedArray = bucketSize == 2 ? WORD_BREAK_PROPERTY_BMP : WORD_BREAK_PROPERTY_NON_BMP;

  return _searchForProperty(encodedArray, codePoint, bucketSize, 0, encodedArray.length / bucketSize - 1);
}

/**
 * Binary search for the word break property of a given CODE POINT.
 *
 * The auto-generated data.ts master strings encode **character range**
 * lookup tables.  If a character's codepoint is equal to or greater than
 * the start-of-range value for an entry and exclusively less than the next
 * entry's start-of-range, it falls within the first entry's range bucket
 * and is classified accordingly by this method.
 *
 * @param encodedArray  The encoded table: consecutive entries of `bucketSize`
 *                      UTF-16 code units, each a range start followed by one
 *                      property-code unit.
 * @param codePoint     The code point to classify.
 * @param bucketSize    Number of UTF-16 code units per entry.
 * @param left          Index of the first entry to consider (inclusive).
 * @param right         Index of the last entry to consider (inclusive).
 * @returns The decoded property value (encoding offset already removed), or
 *          `WordBreakProperty.Other` when no entry in range matches.
 */
function _searchForProperty(encodedArray: string, codePoint: number, bucketSize: number, left: number, right: number): WordBreakProperty {
  let lo = left;
  let hi = right;

  while (lo <= hi) {
    const midpoint = lo + ~~((hi - lo) / 2);

    // codePointAt() yields `undefined` (not NaN) when the index is out of bounds.
    const rangeStart = encodedArray.codePointAt(bucketSize * midpoint);
    if (rangeStart === undefined) {
      break; // Malformed or empty table; treat as "not found".
    }

    // The final entry has no successor, so its range is unbounded above.
    const startOfNextRange = encodedArray.codePointAt(bucketSize * (midpoint + 1)) ?? Infinity;

    if (codePoint < rangeStart) {
      hi = midpoint - 1;
    } else if (codePoint >= startOfNextRange) {
      lo = midpoint + 1;
    } else {
      // We found it!  The property code is the last code unit of the entry.
      const propertyCode = encodedArray.charCodeAt(bucketSize * (midpoint + 1) - 1);
      return (propertyCode - PROPERTY_ENCODING_OFFSET) as WordBreakProperty;
    }
  }

  // All items that are not found in the array are assigned the 'Other'
  // property.  This value is returned as-is: it is a real property value, not
  // an encoded one, so the encoding offset must not be removed from it.
  return WordBreakProperty.Other;
}
33 changes: 33 additions & 0 deletions common/models/wordbreakers/test/test-search-property.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/**
 * Smoke-tests the default word breaker's `searchForProperty` lookup.
 */

import { assert } from 'chai';
import { searchForProperty } from '../build/obj/default/searchForProperty.js';
import { propertyMap } from '../build/obj/default/data.inc.js';

describe('searchForProperty', () => {
  /**
   * Checks, in order, that every `[codePoint, propertyName]` pair resolves to
   * the index of `propertyName` within `propertyMap`.
   */
  const expectProperties = (cases) => {
    for (const [codePoint, propertyName] of cases) {
      assert.equal(searchForProperty(codePoint), propertyMap.indexOf(propertyName));
    }
  };

  it('correctly finds character classes for standard ASCII characters', () => {
    expectProperties([
      ['a'.codePointAt(0), 'ALetter'],
      ['Z'.codePointAt(0), 'ALetter'],

      ["'".codePointAt(0), 'Single_Quote'],
      ['"'.codePointAt(0), 'Double_Quote'],
      [','.codePointAt(0), 'MidNum'],
      ['.'.codePointAt(0), 'MidNumLet'],
      ['-'.codePointAt(0), 'Other']
    ]);
  });

  it('correctly finds character classes for specialized BMP characters', () => {
    expectProperties([
      [0x05D0, 'Hebrew_Letter'],
      [0x3031, 'Katakana'],
      // The last two code points of the BMP.
      [0xFFFE, 'Other'],
      [0xFFFF, 'Other']
    ]);
  });

  it('correctly finds character classes for non-BMP characters', () => {
    expectProperties([
      [0x0001F1E6, 'Regional_Indicator'],
      [0x00013430, 'Format'],
      // The very first non-BMP code point.
      [0x00010000, 'ALetter']
    ]);
  });
});
100 changes: 65 additions & 35 deletions common/models/wordbreakers/tools/data-compiler/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

// Original version found at: https://github.com/eddieantonio/unicode-default-word-boundary/blob/master/libexec/compile-word-break.js

// TODO: Adapt to produce two string-encoded arrays - one for BMP chars, one for non-BMP chars.

import fs from 'fs';
import path from 'path';

Expand Down Expand Up @@ -93,13 +91,54 @@ const categoryMap = new Map<string, number>();

// Give every category the next sequential numeric code.
for (const category of categories) {
  categoryMap.set(category, catIndexSeed);
  catIndexSeed += 1;

  // Never hand out the back-tick's char code: the back-tick delimits the
  // encoded strings, so skipping it reduces complications.
  // NOTE(review): the encoded property chars are written with a +0x20 offset
  // elsewhere in this script, so this comparison is against the un-offset
  // value - confirm which value was meant to be skipped.
  if (catIndexSeed == '`'.charCodeAt(0)) {
    catIndexSeed += 1;
  }
}

// Ranges are partitioned by where they start: at or below U+FFFF (BMP) or
// above it (non-BMP).
const bmpRanges: typeof ranges = [];
const nonBmpRanges: typeof ranges = [];

// Each range has the shape { start: number, property: number }; `ranges` is
// already sorted by `start`.
for (const range of ranges) {
  if (range.start <= 0xFFFF) {
    bmpRanges.push(range);
    continue;
  }

  // On reaching the first non-BMP range, carry the property of the last BMP
  // range across the boundary between the two tables.
  if (nonBmpRanges.length == 0) {
    const boundaryProperty = bmpRanges[bmpRanges.length - 1].property;

    // Explicit entry for the final BMP code point.
    bmpRanges.push({
      start: 0xFFFF,
      property: boundaryProperty,
      end: undefined
    });

    // Make sure the non-BMP table begins exactly at U+10000.
    if (range.start != 0x10000) {
      nonBmpRanges.push({
        start: 0x10000,
        property: boundaryProperty,
        end: undefined
      });
    }
  }

  nonBmpRanges.push(range);
}

//////////////////////// Creating the generated file /////////////////////////

// Save the output in the gen/ directory.
let stream = fs.createWriteStream(generatedFilename);

/**
 * Escapes a single encoded character so that it survives being written
 * verbatim inside a back-tick delimited template literal in the generated
 * file.
 *
 * @param codedChar  The character (one code point) about to be emitted.
 * @returns Source text that evaluates back to exactly `codedChar` when it
 *          appears inside a template literal.
 */
function escape(codedChar: string) {
  switch (codedChar) {
    case '`':   // Would terminate the template literal.
    case '\\':  // Would start an escape sequence.
    case '$':   // Followed by '{', would start an interpolation.
      return '\\' + codedChar;
    case '\r':
      // A raw carriage return inside a template literal is normalized to a
      // line feed when parsed, which would silently change the encoded value.
      return '\\r';
    default:
      return codedChar;
  }
}

// // Former entry in the original version by Eddie that was never included in our repo:
// export const extendedPictographic = ${extendedPictographicRegExp};

Expand All @@ -116,7 +155,7 @@ stream.write(`// Automatically generated file. DO NOT MODIFY.
export const enum WordBreakProperty {
${ /* Create enum values for each word break property */
Array.from(categories)
.map(x => ` ${x}`)
.map(x => ` ${x} = ${categoryMap.get(x)}`)
.join(',\n')
}
};
Expand All @@ -133,38 +172,29 @@ ${ /* Enumerate the plain-text names for ease of lookup at runtime */
}
];
/**
* Constants for indexing values in WORD_BREAK_PROPERTY.
*/
export const enum I {
Start = 0,
Value = 1
}
/**
* Defines a mapping of all characters to their assigned word-breaking
* property type.
*
* There are implicit buckets starting at the char with specified code \`number\`
* of an entry up to, but not including, the value in the next entry. All
* entries in each bucket share the same property value.
*
* Consider the following two consecutive buckets:
* - [0x0041, WordBreakProperty.ALetter]
* - [0x005B, WordBreakProperty.Other]
*
* For this example, all characters from 0x0041 to 0x005B (that is, 'A'-'Z')
* have the wordbreaking property \`ALetter\`.
*/
export const WORD_BREAK_PROPERTY: [number, WordBreakProperty][] = [
${
// TODO: Two versions: one that's BMP-encoded, one that's non-BMP encoded.
ranges.map(({start, property}) => (` [` +
`/*start*/ 0x${start.toString(16).toUpperCase()}, ` +
`WordBreakProperty.${property}],`
)).join('\n')
}
];
export const WORD_BREAK_PROPERTY_BMP: string = \`${
// To consider: emit `\uxxxx` codes instead of the raw char?
bmpRanges.map(({start, property}) => {
let codedStart = escape(String.fromCodePoint(start));
// Offset the encoded property value to lie within a friendlier range,
// with characters that render naturally within code editors.
const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20));
return `${codedStart}${codedProp}`;
}).join('')
}\`;
export const WORD_BREAK_PROPERTY_NON_BMP: string = \`${
// To consider: emit `\uxxxx` codes instead of the raw char?
nonBmpRanges.map(({start, property}) => {
const codedStart = escape(String.fromCodePoint(start));
// Offset the encoded property value to lie within a friendlier range,
// with characters that render naturally within code editors.
const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20));
return `${codedStart}${codedProp}`;
}).join('')
}\`;
`);

/**
Expand Down

0 comments on commit 957412a

Please sign in to comment.