diff --git a/common/models/wordbreakers/src/main/default/index.ts b/common/models/wordbreakers/src/main/default/index.ts index 8df6774de2d..7771f1c9a66 100644 --- a/common/models/wordbreakers/src/main/default/index.ts +++ b/common/models/wordbreakers/src/main/default/index.ts @@ -1,4 +1,6 @@ -import { WordBreakProperty, WORD_BREAK_PROPERTY, I, propertyMap } from "./data.inc.js"; +import { WordBreakProperty, propertyMap } from "./data.inc.js"; + +import { searchForProperty } from "./searchForProperty.js"; /** * A set of options used to customize and extend the behavior of the default @@ -566,7 +568,7 @@ function property(character: string, options?: DefaultWordBreakerOptions): WordB // TODO: remove dependence on character.codepointAt()? let codepoint = character.codePointAt(0) as number; - return searchForProperty(codepoint, 0, WORD_BREAK_PROPERTY.length - 1); + return searchForProperty(codepoint); } function propertyVal(propName: string, options?: DefaultWordBreakerOptions) { @@ -574,35 +576,4 @@ function propertyVal(propName: string, options?: DefaultWordBreakerOptions) { const customIndex = options?.customProperties?.findIndex(matcher) ?? -1; return customIndex != -1 ? -customIndex - 1 : propertyMap.findIndex(matcher); -} - -/** - * Binary search for the word break property of a given CODE POINT. - * - * The auto-generated data.ts master array defines a **character range** - * lookup table. If a character's codepoint is equal to or greater than - * the I.Start value for an entry and exclusively less than the next entry, - * it falls in the first entry's range bucket and is classified accordingly - * by this method. - */ -function searchForProperty(codePoint: number, left: number, right: number): WordBreakProperty { - // All items that are not found in the array are assigned the 'Other' property. 
/**
 * Offset that was added to every property value when the lookup tables were
 * encoded as strings (the data compiler writes `property + 0x20`).  It must
 * be removed again when a property character is read back from a table.
 */
const PROPERTY_CODE_OFFSET = 0x20;

/**
 * Looks up the word-break property assigned to a code point.
 *
 * Two string-encoded range tables exist: one for BMP code points, where each
 * entry is two UTF-16 code units (range start + property), and one for
 * non-BMP code points, where each entry is three code units (surrogate pair
 * for the range start + property).
 *
 * @param codePoint The Unicode code point to classify.
 * @returns The property of the range containing `codePoint`, or
 *          `WordBreakProperty.Other` when no range contains it.
 */
export function searchForProperty(codePoint: number): WordBreakProperty {
  // Non-BMP range starts need a surrogate pair, hence the wider bucket.
  const bucketSize = codePoint <= 0xFFFF ? 2 : 3;
  const encodedArray = bucketSize == 2 ? WORD_BREAK_PROPERTY_BMP : WORD_BREAK_PROPERTY_NON_BMP;

  return _searchForProperty(encodedArray, codePoint, bucketSize, 0, encodedArray.length / bucketSize - 1);
}

/**
 * Binary search for the word break property of a given CODE POINT.
 *
 * The encoded string is a **character range** lookup table.  If a code point
 * is equal to or greater than the start-of-range value of an entry and
 * strictly less than the next entry's start-of-range, it falls within the
 * first entry's bucket and is classified accordingly.  The final entry's
 * bucket is open-ended.
 *
 * @param encodedArray The string-encoded table to search.
 * @param codePoint    The code point to classify.
 * @param bucketSize   Number of UTF-16 code units per table entry.
 * @param left         Index of the first entry to consider (inclusive).
 * @param right        Index of the last entry to consider (inclusive).
 * @returns The decoded property (offset already removed), or
 *          `WordBreakProperty.Other` if no entry in `[left, right]` matches.
 */
function _searchForProperty(encodedArray: string, codePoint: number, bucketSize: number, left: number, right: number): WordBreakProperty {
  let low = left;
  let high = right;

  while (low <= high) {
    const midpoint = low + Math.floor((high - low) / 2);
    const rangeStart = encodedArray.codePointAt(bucketSize * midpoint);

    if (rangeStart === undefined) {
      // The index lies beyond the end of the table; nothing can match.
      break;
    }

    // codePointAt() yields undefined past the end of the string, which marks
    // the last entry: its bucket has no upper bound.
    const startOfNextRange = encodedArray.codePointAt(bucketSize * (midpoint + 1)) ?? Infinity;

    if (codePoint < rangeStart) {
      high = midpoint - 1;
    } else if (codePoint >= startOfNextRange) {
      low = midpoint + 1;
    } else {
      // We found it!  The property character is the last code unit of the
      // entry; strip the encoding offset here so that every return path of
      // this function yields a real WordBreakProperty value.
      return encodedArray.charCodeAt(bucketSize * (midpoint + 1) - 1) - PROPERTY_CODE_OFFSET;
    }
  }

  // All items that are not found in the array are assigned the 'Other' property.
  return WordBreakProperty.Other;
}
assert.equal(searchForProperty('"'.codePointAt(0)), propertyMap.indexOf('Double_Quote')); + assert.equal(searchForProperty(','.codePointAt(0)), propertyMap.indexOf('MidNum')); + assert.equal(searchForProperty('.'.codePointAt(0)), propertyMap.indexOf('MidNumLet')); + assert.equal(searchForProperty('-'.codePointAt(0)), propertyMap.indexOf('Other')); + }); + + it('correctly finds character classes for specialized BMP characters', () => { + assert.equal(searchForProperty(0x05D0), propertyMap.indexOf('Hebrew_Letter')); + assert.equal(searchForProperty(0x3031), propertyMap.indexOf('Katakana')); + assert.equal(searchForProperty(0xFFFE), propertyMap.indexOf('Other')); + assert.equal(searchForProperty(0xFFFF), propertyMap.indexOf('Other')); + }); + + it('correctly finds character classes for non-BMP characters', () => { + assert.equal(searchForProperty(0x0001F1E6), propertyMap.indexOf('Regional_Indicator')); + assert.equal(searchForProperty(0x00013430), propertyMap.indexOf('Format')); + assert.equal(searchForProperty(0x00010000), propertyMap.indexOf('ALetter')); + }); +}); \ No newline at end of file diff --git a/common/models/wordbreakers/tools/data-compiler/index.ts b/common/models/wordbreakers/tools/data-compiler/index.ts index d2effa58562..8f3c41bad37 100644 --- a/common/models/wordbreakers/tools/data-compiler/index.ts +++ b/common/models/wordbreakers/tools/data-compiler/index.ts @@ -2,8 +2,6 @@ // Original version found at: https://github.com/eddieantonio/unicode-default-word-boundary/blob/master/libexec/compile-word-break.js -// TODO: Adapt to produce two string-encoded arrays - one for BMP chars, one for non-BMP chars. - import fs from 'fs'; import path from 'path'; @@ -93,6 +91,39 @@ const categoryMap = new Map(); for(let cat of categories) { categoryMap.set(cat, catIndexSeed++); + if(catIndexSeed == '`'.charCodeAt(0)) { + catIndexSeed++; // Skip the back-tick as an encoding symbol. + // Reduces complications, as it's the encoding string start/end char. 
/**
 * Escapes one encoded character so that it survives being written raw
 * between the back-ticks of a template literal in the generated file.
 *
 * - A back-tick would terminate the literal and a backslash would start an
 *   escape sequence, so both are prefixed with a backslash.
 * - `$` is prefixed as well: an unescaped `$` directly followed by `{` would
 *   open a template substitution.
 * - A raw carriage return inside a template literal is normalized to a line
 *   feed when the literal is evaluated, so it is emitted as the two-character
 *   escape sequence instead.
 *
 * @param codedChar A single character (one code point) of encoded table data.
 * @returns Source text that evaluates back to `codedChar` inside a template
 *          literal.
 */
function escape(codedChar: string): string {
  switch (codedChar) {
    case '`':
    case '\\':
    case '$':
      return '\\' + codedChar;
    case '\r':
      return '\\r';
    default:
      return codedChar;
  }
}
- * - * Consider the following two consecutive buckets: - * - [0x0041, WordBreakProperty.ALetter] - * - [0x005B, WordBreakProperty.Other] - * - * For this example, all characters from 0x0041 to 0x005B (that is, 'A'-'Z') - * have the wordbreaking property \`ALetter\`. - */ -export const WORD_BREAK_PROPERTY: [number, WordBreakProperty][] = [ -${ - // TODO: Two versions: one that's BMP-encoded, one that's non-BMP encoded. - ranges.map(({start, property}) => (` [` + - `/*start*/ 0x${start.toString(16).toUpperCase()}, ` + - `WordBreakProperty.${property}],` - )).join('\n') -} -]; +export const WORD_BREAK_PROPERTY_BMP: string = \`${ + // To consider: emit `\uxxxx` codes instead of the raw char? + bmpRanges.map(({start, property}) => { + let codedStart = escape(String.fromCodePoint(start)); + + // Offset the encoded property value to lie within a friendlier range, + // with characters that render naturally within code editors. + const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20)); + return `${codedStart}${codedProp}`; + }).join('') +}\`; + +export const WORD_BREAK_PROPERTY_NON_BMP: string = \`${ + // To consider: emit `\uxxxx` codes instead of the raw char? + nonBmpRanges.map(({start, property}) => { + const codedStart = escape(String.fromCodePoint(start)); + + // Offset the encoded property value to lie within a friendlier range, + // with characters that render naturally within code editors. + const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20)); + return `${codedStart}${codedProp}`; + }).join('') +}\`; `); /**