Merge pull request #10692 from keymanapp/feat/web/wordbreaker-data-optimization

feat(web): optimize the wordbreaker data table for filesize and ease of first-load parsing ⚡
keymanapp · Aug 27, 2024 · 957412a · 957412a
2 parents 3e89ff8 + 4159bab
commit 957412a
Show file tree

Hide file tree

Showing 4 changed files with 146 additions and 69 deletions.
diff --git a/common/models/wordbreakers/src/main/default/index.ts b/common/models/wordbreakers/src/main/default/index.ts
@@ -1,4 +1,6 @@
-import { WordBreakProperty, WORD_BREAK_PROPERTY, I, propertyMap } from "./data.inc.js";
+import { WordBreakProperty, propertyMap } from "./data.inc.js";
+
+import { searchForProperty } from "./searchForProperty.js";
 
 /**
  * A set of options used to customize and extend the behavior of the default
@@ -566,43 +568,12 @@ function property(character: string, options?: DefaultWordBreakerOptions): WordB
   // TODO: remove dependence on character.codepointAt()?
   let codepoint = character.codePointAt(0) as number;
 
-  return searchForProperty(codepoint, 0, WORD_BREAK_PROPERTY.length - 1);
+  return searchForProperty(codepoint);
 }
 
 function propertyVal(propName: string, options?: DefaultWordBreakerOptions) {
   const matcher = (name: string) => name.toLowerCase() == propName.toLowerCase()
 
   const customIndex = options?.customProperties?.findIndex(matcher) ?? -1;
   return customIndex != -1 ? -customIndex - 1 : propertyMap.findIndex(matcher);
-}
-
-/**
- * Binary search for the word break property of a given CODE POINT.
- *
- * The auto-generated data.ts master array defines a **character range**
- * lookup table.  If a character's codepoint is equal to or greater than
- * the I.Start value for an entry and exclusively less than the next entry,
- * it falls in the first entry's range bucket and is classified accordingly
- * by this method.
- */
-function searchForProperty(codePoint: number, left: number, right: number): WordBreakProperty {
-  // All items that are not found in the array are assigned the 'Other' property.
-  if (right < left) {
-    return WordBreakProperty.Other;
-  }
-
-  let midpoint = left + ~~((right - left) / 2);
-  let candidate = WORD_BREAK_PROPERTY[midpoint];
-
-  let nextRange = WORD_BREAK_PROPERTY[midpoint + 1];
-  let startOfNextRange = nextRange ? nextRange[I.Start] : Infinity;
-
-  if (codePoint < candidate[I.Start]) {
-    return searchForProperty(codePoint, left, midpoint - 1);
-  } else if (codePoint >= startOfNextRange) {
-    return searchForProperty(codePoint, midpoint + 1, right);
-  }
-
-  // We found it!
-  return candidate[I.Value];
-}
+}
diff --git a/common/models/wordbreakers/src/main/default/searchForProperty.ts b/common/models/wordbreakers/src/main/default/searchForProperty.ts
@@ -0,0 +1,43 @@
+import { WordBreakProperty, WORD_BREAK_PROPERTY_BMP, WORD_BREAK_PROPERTY_NON_BMP } from "./data.inc.js";
+
+export function searchForProperty(codePoint: number): WordBreakProperty {
+  const bucketSize = codePoint <= 0xFFFF ? 2 : 3;
+
+  // Non-BMP chars need a surrogate pair, so each of their entries takes one more code unit.
+  const encodedArray = bucketSize == 2 ? WORD_BREAK_PROPERTY_BMP : WORD_BREAK_PROPERTY_NON_BMP;
+
+  return _searchForProperty(encodedArray, codePoint, bucketSize, 0, encodedArray.length / bucketSize - 1) - 0x20;
+}
+
+/**
+ * Binary search for the word break property of a given CODE POINT.
+ *
+ * The auto-generated data.ts master strings encode **character range**
+ * lookup tables.  If a character's codepoint is equal to or greater than
+ * the start-of-range value for an entry and exclusively less than the next
+ * entry's start-of-range, it falls within the first entry's range bucket
+ * and is classified accordingly by this method.
+ */
+function _searchForProperty(encodedArray: string, codePoint: number, bucketSize: number, left: number, right: number): WordBreakProperty {
+  // Items not found in the array get 'Other'. NOTE(review): searchForProperty subtracts 0x20 from this un-offset value - confirm this branch is unreachable.
+  if (right < left) {  // May need special handling at end of BMP / start of non-BMP.
+    return WordBreakProperty.Other;
+  }
+
+  let midpoint = left + ~~((right - left) / 2);
+  let candidate = encodedArray.codePointAt(bucketSize * midpoint);
+
+  // If out-of-bounds, codePointAt() gives undefined, for which isNaN() returns true.
+  let nextRange = encodedArray.codePointAt(bucketSize * (midpoint + 1));
+  let startOfNextRange = isNaN(nextRange) ? Infinity : nextRange;
+
+  if (codePoint < candidate) {
+    return _searchForProperty(encodedArray, codePoint, bucketSize, left, midpoint - 1);
+  } else if (codePoint >= startOfNextRange) {
+    return _searchForProperty(encodedArray, codePoint, bucketSize, midpoint + 1, right);
+  }
+
+  // We found it!
+  const propertyCode = encodedArray.charCodeAt(bucketSize * (midpoint + 1) - 1);
+  return propertyCode as WordBreakProperty;
+}
diff --git a/common/models/wordbreakers/test/test-search-property.js b/common/models/wordbreakers/test/test-search-property.js
@@ -0,0 +1,33 @@
+/**
+ * Smoke-test the default word breaker's property lookup (searchForProperty).
+ */
+
+import { assert } from 'chai';
+import { searchForProperty } from '../build/obj/default/searchForProperty.js';
+import { propertyMap } from '../build/obj/default/data.inc.js';
+
+describe('searchForProperty', () => {
+  it('correctly finds character classes for standard ASCII characters', () => {
+    assert.equal(searchForProperty('a'.codePointAt(0)), propertyMap.indexOf('ALetter'));
+    assert.equal(searchForProperty('Z'.codePointAt(0)), propertyMap.indexOf('ALetter'));
+
+    assert.equal(searchForProperty("'".codePointAt(0)), propertyMap.indexOf('Single_Quote'));
+    assert.equal(searchForProperty('"'.codePointAt(0)), propertyMap.indexOf('Double_Quote'));
+    assert.equal(searchForProperty(','.codePointAt(0)), propertyMap.indexOf('MidNum'));
+    assert.equal(searchForProperty('.'.codePointAt(0)), propertyMap.indexOf('MidNumLet'));
+    assert.equal(searchForProperty('-'.codePointAt(0)), propertyMap.indexOf('Other'));
+  });
+
+  it('correctly finds character classes for specialized BMP characters', () => {
+    assert.equal(searchForProperty(0x05D0), propertyMap.indexOf('Hebrew_Letter'));
+    assert.equal(searchForProperty(0x3031), propertyMap.indexOf('Katakana'));
+    assert.equal(searchForProperty(0xFFFE), propertyMap.indexOf('Other'));
+    assert.equal(searchForProperty(0xFFFF), propertyMap.indexOf('Other'));
+  });
+
+  it('correctly finds character classes for non-BMP characters', () => {
+    assert.equal(searchForProperty(0x0001F1E6), propertyMap.indexOf('Regional_Indicator'));
+    assert.equal(searchForProperty(0x00013430), propertyMap.indexOf('Format'));
+    assert.equal(searchForProperty(0x00010000), propertyMap.indexOf('ALetter'));
+  });
+});
diff --git a/common/models/wordbreakers/tools/data-compiler/index.ts b/common/models/wordbreakers/tools/data-compiler/index.ts
@@ -2,8 +2,6 @@
 
 // Original version found at: https://github.com/eddieantonio/unicode-default-word-boundary/blob/master/libexec/compile-word-break.js
 
-// TODO:  Adapt to produce two string-encoded arrays - one for BMP chars, one for non-BMP chars.
-
 import fs from 'fs';
 import path from 'path';
 
@@ -93,13 +91,54 @@ const categoryMap = new Map<string, number>();
 
 for(let cat of categories) {
   categoryMap.set(cat, catIndexSeed++);
+  if(catIndexSeed == '`'.charCodeAt(0)) {
+    catIndexSeed++; // Skip the back-tick as an encoding symbol.
+                    // Reduces complications, as it's the encoding string start/end char.
+  }
+}
+
+const bmpRanges: typeof ranges = [];
+const nonBmpRanges: typeof ranges = [];
+
+// Each range has { start: number, property: string (category name; key into categoryMap), end }.
+for(let range of ranges) { // already sorted
+  if(range.start <= 0xFFFF) {
+    bmpRanges.push(range);
+  } else {
+    if(nonBmpRanges.length == 0) {
+      const finalBmpRange = bmpRanges[bmpRanges.length - 1];
+      bmpRanges.push({
+        start: 0xFFFF,
+        property: finalBmpRange.property,
+        end: undefined
+      });
+
+      if(range.start != 0x10000) {
+        nonBmpRanges.push({
+          start: 0x10000,
+          property: finalBmpRange.property,
+          end: undefined
+        });
+      }
+    }
+
+    nonBmpRanges.push(range);
+  }
 }
 
 //////////////////////// Creating the generated file /////////////////////////
 
 // Save the output in the gen/ directory.
 let stream = fs.createWriteStream(generatedFilename);
 
+function escape(codedChar: string) {
+  if(codedChar == '`' || codedChar == '\\') {
+    return '\\' + codedChar;
+  } else {
+    return codedChar;
+  }
+}
+
 // // Former entry in the original version by Eddie that was never included in our repo:
 // export const extendedPictographic = ${extendedPictographicRegExp};
 
@@ -116,7 +155,7 @@ stream.write(`// Automatically generated file. DO NOT MODIFY.
 export const enum WordBreakProperty {
 ${ /* Create enum values for each word break property */
   Array.from(categories)
-    .map(x => `  ${x}`)
+    .map(x => `  ${x} = ${categoryMap.get(x)}`)
     .join(',\n')
 }
 };
@@ -133,38 +172,29 @@ ${ /* Enumerate the plain-text names for ease of lookup at runtime */
 }
 ];
 
-/**
- * Constants for indexing values in WORD_BREAK_PROPERTY.
- */
-export const enum I {
-  Start = 0,
-  Value = 1
-}
-
-/**
- * Defines a mapping of all characters to their assigned word-breaking
- * property type.
- *
- * There are implicit buckets starting at the char with specified code \`number\`
- * of an entry up to, but not including, the value in the next entry.  All
- * entries in each bucket share the same property value.
- *
- * Consider the following two consecutive buckets:
- * - [0x0041, WordBreakProperty.ALetter]
- * - [0x005B, WordBreakProperty.Other]
- *
- * For this example, all characters from 0x0041 to 0x005B (that is, 'A'-'Z')
- * have the wordbreaking property \`ALetter\`.
- */
-export const WORD_BREAK_PROPERTY: [number, WordBreakProperty][] = [
-${
-  // TODO:  Two versions:  one that's BMP-encoded, one that's non-BMP encoded.
-    ranges.map(({start, property}) => (`  [` +
-      `/*start*/ 0x${start.toString(16).toUpperCase()}, ` +
-      `WordBreakProperty.${property}],`
-    )).join('\n')
-}
-];
+export const WORD_BREAK_PROPERTY_BMP: string = \`${
+  // To consider:  emit `\uxxxx` codes instead of the raw char?
+  bmpRanges.map(({start, property}) => {
+    let codedStart = escape(String.fromCodePoint(start));
+
+    // Offset the encoded property value to lie within a friendlier range,
+    // with characters that render naturally within code editors.
+    const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20));
+    return `${codedStart}${codedProp}`;
+  }).join('')
+}\`;
+
+export const WORD_BREAK_PROPERTY_NON_BMP: string = \`${
+  // To consider:  emit `\uxxxx` codes instead of the raw char?
+  nonBmpRanges.map(({start, property}) => {
+    const codedStart = escape(String.fromCodePoint(start));
+
+    // Offset the encoded property value to lie within a friendlier range,
+    // with characters that render naturally within code editors.
+    const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20));
+    return `${codedStart}${codedProp}`;
+  }).join('')
+}\`;
 `);
 
 /**