Merge pull request #12229 from keymanapp/fix/web/mitigate-malformed-w…

…ordbreaking fix(web): improve tokenization output when wordbreaker breaks spec for span properties in output
keymanapp · Aug 21, 2024 · 28ccfe1 · 28ccfe1
2 parents a98ee70 + ec32e21
commit 28ccfe1
Show file tree

Hide file tree

Showing 3 changed files with 226 additions and 12 deletions.
diff --git a/common/models/templates/src/tokenization.ts b/common/models/templates/src/tokenization.ts
@@ -61,20 +61,21 @@ export function tokenize(
   let currentIndex = 0;
   while(leftSpans.length > 0) {
     const nextSpan = leftSpans[0];
-    if(nextSpan.start != currentIndex) {
+    if(Math.max(nextSpan.start, currentIndex) != currentIndex) {
+      const nextIndex = Math.max(currentIndex, nextSpan.start);
       // Implicit whitespace span!
       tokenization.left.push({
-        text: context.left!.substring(currentIndex, nextSpan.start),
+        text: context.left!.substring(currentIndex, nextIndex),
         isWhitespace: true
       });
-      currentIndex = nextSpan.start;
+      currentIndex = nextIndex;
     } else {
       leftSpans.shift();
       // Explicit non-whitespace span.
       tokenization.left.push({
         text: nextSpan.text
       });
-      currentIndex = nextSpan.end;
+      currentIndex = Math.max(currentIndex, nextSpan.end);
     }
   }
 
@@ -84,11 +85,12 @@ export function tokenize(
   // Note:  the default wordbreaker won't need this code, as it emits a `''`
   // after final whitespace.
   if(context.left != null && currentIndex != context.left.length) {
+    const nextIndex = Math.max(currentIndex, context.left!.length);
     tokenization.left.push({
-      text: context.left.substring(currentIndex, context.left!.length),
+      text: context.left.substring(currentIndex, nextIndex),
       isWhitespace: true
     });
-    currentIndex = context.left!.length;
+    currentIndex = nextIndex;
   }
 
   // New step 2: handle any rejoins needed.
@@ -134,13 +136,14 @@ export function tokenize(
   // `caretSplitsToken` check is additional.
   while(rightSpans.length > 0) {
     const nextSpan = rightSpans[0];
-    if(nextSpan.start != currentIndex) {
+    if(Math.max(nextSpan.start, currentIndex) != currentIndex) {
+      const nextIndex = Math.max(currentIndex, nextSpan.start);
       // Implicit whitespace span!
       tokenization.right.push({
-        text: context.right!.substring(currentIndex, nextSpan.start),
+        text: context.right!.substring(currentIndex, nextIndex),
         isWhitespace: true
       });
-      currentIndex = nextSpan.start;
+      currentIndex = nextIndex;
     } else {
       const leftTail = tokenization.left[leftTokenCount-1];
       if(leftTail) {
@@ -159,7 +162,7 @@ export function tokenize(
       tokenization.right.push({
         text: nextSpan.text
       });
-      currentIndex = nextSpan.end;
+      currentIndex = Math.max(currentIndex, nextSpan.end);
     }
 
     // We've always processed the "first right token" after the first iteration.
@@ -176,11 +179,12 @@ export function tokenize(
   // Also note:  is pretty much WET with the similar check after the
   // leftSpan loop.
   if(context.right && currentIndex != context.right.length) {
+    const nextIndex = Math.max(currentIndex, context.right!.length);
     tokenization.right.push({
-      text: context.right.substring(currentIndex, context.right!.length),
+      text: context.right.substring(currentIndex, nextIndex),
       isWhitespace: true
     });
-    currentIndex = context.right!.length;
+    currentIndex = nextIndex;
   }
 
   return tokenization;

diff --git a/common/models/templates/test/custom-breakers.def.js b/common/models/templates/test/custom-breakers.def.js
@@ -0,0 +1,128 @@
+/**
+ * The custom wordbreaker used by the sil.km.gcc model as of
+ * https://github.com/keymanapp/lexical-models/pull/265.
+ * @type {WordBreakingFunction}
+ * */
+export function customWordBreakerProper(str) {
+  const whitespaceRegex = /\s|\u200b|\n|\r/;
+  const tokens = str.split(whitespaceRegex);
+
+  for(let i=0; i < tokens.length; i++) {
+    const token = tokens[i];
+    if(token.length == 0) {
+      tokens.splice(i, 1);
+      i--;
+      continue;
+    } else if(token.length == 1 && whitespaceRegex.test(token)) {
+      tokens.splice(i, 1);
+      i--;
+      continue;
+    }
+
+    // Certain punctuation marks should be considered a separate token from the word they're next to.
+    const punctuationMarks = ['«', '»', '$', '#' /* add extras here */];
+    const punctSplitIndices = [];
+
+    // Find if and where each mark exists within the token
+    for(let i = 0; i < punctuationMarks.length; i++) {
+      const split = token.indexOf(punctuationMarks[i]);
+      if(split >= 0) {
+        punctSplitIndices.push(split);
+      }
+    }
+
+    // Sort and pick the earliest mark's location.  If none exists, use -1.
+    punctSplitIndices.sort();
+    const splitPoint = punctSplitIndices[0] === undefined ? -1 : punctSplitIndices[0];
+
+    if(splitPoint > -1) {
+      const left = token.substring(0, splitPoint);  // (0, -1) => ''
+      const punct = token.substring(splitPoint, splitPoint+1);
+      const right = token.substring(splitPoint+1);  // Starting past the end of the string => ''
+
+      if(left) {
+        tokens.splice(i++, 0, left);
+      }
+      tokens.splice(i++, 1, punct);
+      if(right) {
+        tokens.splice(i, 0, right);
+      }
+      // Ensure that the next iteration puts `i` immediately after the punctuation token... even if
+      // there was a `right` portion, as it may have extra marks that also need to be spun off.
+      i--;
+    }
+  }
+
+  let latestIndex = 0;
+  return tokens.map(function(token) {
+    const start = str.indexOf(token, latestIndex);
+    latestIndex = start + token.length;
+    return {
+      left: start,
+      start: start,
+      right: start + token.length,
+      end: start + token.length,
+      length: token.length,
+      text: token
+    }
+  });
+}
+
+/**
+ * The version of the custom wordbreaker used by the sil.km.gcc model
+ * before https://github.com/keymanapp/lexical-models/pull/265, which
+ * triggered #12200.
+ * @type {WordBreakingFunction}
+ * */
+export function customWordBreakerFormer (str) {
+  const tokens = str.split(/\s|\u200b/);
+
+  for(let i=0; i < tokens.length; i++) {
+    const token = tokens[i];
+    if(token.length == 1) {
+      continue;
+    }
+
+    // Certain punctuation marks should be considered a separate token from the word they're next to.
+    const punctuationMarks = ['«', '»', '$', '#' /* add extras here */];
+    const punctSplitIndices = [];
+
+    // Find if and where each mark exists within the token
+    for(let i = 0; i < punctuationMarks.length; i++) {
+      const split = token.indexOf(punctuationMarks[i]);
+      if(split >= 0) {
+        punctSplitIndices.push(split);
+      }
+    }
+
+    // Sort and pick the earliest mark's location.  If none exists, use -1.
+    punctSplitIndices.sort();
+    const splitPoint = punctSplitIndices[0] === undefined ? -1 : punctSplitIndices[0];
+
+    if(splitPoint > -1) {
+      const left = token.substring(0, splitPoint);  // (0, -1) => ''
+      const punct = token.substring(splitPoint, splitPoint+1);
+      const right = token.substring(splitPoint+1);  // Starting past the end of the string => ''
+
+      if(left) {
+        tokens.splice(i++, 0, left);
+      }
+      tokens.splice(i++, 1, punct);
+      if(right) {
+        tokens.splice(i, 0, right);
+      }
+      // Ensure that the next iteration puts `i` immediately after the punctuation token... even if
+      // there was a `right` portion, as it may have extra marks that also need to be spun off.
+      i--;
+    }
+  }
+  return tokens.map(function(token) {
+    return {
+      left: str.indexOf(token),
+      start: str.indexOf(token),
+      right: str.indexOf(token) + token.length,
+      end: str.indexOf(token) + token.length,
+      text: token
+    }
+  });
+}
diff --git a/common/models/templates/test/test-tokenization.js b/common/models/templates/test/test-tokenization.js
@@ -5,6 +5,7 @@
 import { assert } from 'chai';
 import * as models from "@keymanapp/models-templates";
 import * as wordBreakers from "@keymanapp/models-wordbreakers";
+import { customWordBreakerFormer, customWordBreakerProper } from './custom-breakers.def.js';
 
 function asProcessedToken(text) {
   // default wordbreaker emits these at the end of each context half if ending with whitespace.
@@ -490,6 +491,87 @@ describe('Tokenization functions', function() {
         caretSplitsToken: true
       });
     });
+
+    it('mitigates effects of previously-distributed malformed wordbreaker output', function () {
+      const text = 'the  quick brown fox      jumped over the lazy dog  ';
+      /** @type { Context } */
+      const context = {
+        left: text,
+        right: '',
+        startOfBuffer: true,
+        endOfBuffer: true
+      }
+
+      const tokenized = models.tokenize(customWordBreakerFormer, context);
+
+      // Mitigation aims to prevent the _worst_ side-effects that can result from invalidating the
+      // underlying assumption of a monotonically-increasing index within the context -
+      // assigning repeated or blank entries the text that preceded them!
+      assert.notExists(tokenized.left.find((token) => token.text == text));
+      assert.notExists(tokenized.left.find((token) => token.text.startsWith(text.substring(0, 25))));
+
+      // 'the' appears twice in the context, which should result in two separate 'the' tokens here.
+      // This was improperly handled when we didn't check that assumption.
+      assert.equal(tokenized.left.filter((token) => token.text == 'the').length, 2);
+
+      // Does not address multiple blank-token ('') entries that result from intervening spaces;
+      // that would add too much extra complexity to the method... and it can already be
+      // handled decently by the predictive-text engine.
+      assert.deepEqual(
+        tokenized.left
+          .filter((entry) => !entry.isWhitespace)
+          .filter((entry) => entry.text != '')
+          .map((entry) => entry.text),
+        ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
+      );
+    });
+
+    it('properly works with well-formed custom wordbreaker output', function () {
+      const text = 'the  quick brown fox      jumped over the lazy dog  ';
+      /** @type { Context } */
+      const context = {
+        left: text,
+        right: '',
+        startOfBuffer: true,
+        endOfBuffer: true
+      }
+
+      const tokenized = models.tokenize(customWordBreakerProper, context);
+
+      // Easier-to-parse version
+      assert.deepEqual(
+        tokenized.left
+          .filter((entry) => !entry.isWhitespace)
+          .map((entry) => entry.text),
+        ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
+      );
+
+      // This time, with whitespaces.
+      assert.deepEqual(
+        tokenized.left.map((entry) => entry.text), [
+          'the',
+          '  ',
+          'quick',
+          ' ',
+          'brown',
+          ' ',
+          'fox',
+          '      ',
+          'jumped',
+          ' ',
+          'over',
+          ' ',
+          'the',
+          ' ',
+          'lazy',
+          ' ',
+          'dog',
+          '  '
+        ]
+      );
+    });
+
+    //
   });
 
   describe('getLastPreCaretToken', function() {