From 0b79b86197a2ce335e01b299b634c6a25a2e971f Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Tue, 13 Feb 2024 08:26:27 +0700 Subject: [PATCH 1/7] feat(common/models): initial pass for encoded-string wordbreaker data tables --- .../wordbreakers/src/data-compiler/index.ts | 65 +++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/common/models/wordbreakers/src/data-compiler/index.ts b/common/models/wordbreakers/src/data-compiler/index.ts index e217bc51f15..f639f670a84 100644 --- a/common/models/wordbreakers/src/data-compiler/index.ts +++ b/common/models/wordbreakers/src/data-compiler/index.ts @@ -95,6 +95,37 @@ const categoryMap = new Map(); for(let cat of categories) { categoryMap.set(cat, catIndexSeed++); + if(catIndexSeed == '`'.charCodeAt(0)) { + catIndexSeed++; // Skip the back-tick as an encoding symbol. + // Reduces complications, as it's the encoding string start/end char. + } +} + +const bmpRanges: typeof ranges = []; +const nonBmpRanges: typeof ranges = []; + +// { start: number, property: number}[] +for(let range of ranges) { // already sorted + if(range.start <= 0xFFFF) { + bmpRanges.push(range); + } else { + if(nonBmpRanges.length == 0) { + const finalBmpRange = bmpRanges[bmpRanges.length - 1]; + bmpRanges.push({ + start: 0xFFFF, + property: range.property, + end: undefined + }); + + nonBmpRanges.push({ + start: 0x10000, + property: finalBmpRange.property, + end: undefined + }); + } + + nonBmpRanges.push(range); + } } //////////////////////// Creating the generated file ///////////////////////// @@ -107,28 +138,40 @@ let stream = fs.createWriteStream(generatedFilename); // Generate the file! stream.write(`// Automatically generated file. DO NOT MODIFY. - /** * Valid values for a word break property. */ export const enum WordBreakProperty { ${ /* Create enum values for each word break property */ Array.from(categories) - .map(x => ` ${x}`) + .map(x => ` ${x} = ${categoryMap.get(x)}`) .join(',\n') } }; -export const WORD_BREAK_PROPERTY: [number, WordBreakProperty][] = [ -${ - // TODO: Two versions: one that's BMP-encoded, one that's non-BMP encoded. - ranges.map(({start, property}) => (` [` + - `/*start*/ 0x${start.toString(16).toUpperCase()}, ` + - `WordBreakProperty.${property}],` - )).join('\n') +export const WORD_BREAK_PROPERTY_BMP: string = \`${ + // To consider: emit `\uxxxx` codes instead of the raw char? + bmpRanges.map(({start, property}) => { + let codedStart = String.fromCodePoint(start); + if(codedStart == '`') { + // Prevents accidental unescaped use of the string start/end char. + // The backslash gets removed on file-load. + codedStart = '\\`'; + } + const codedProp = String.fromCharCode(categoryMap.get(property)); + return `${codedStart}${codedProp}`; + }).join('') +}\` + +export const WORD_BREAK_PROPERTY_NON_BMP: string = \`${ + // To consider: emit `\uxxxx` codes instead of the raw char? + nonBmpRanges.map(({start, property}) => { + const codedStart = String.fromCodePoint(start); + const codedProp = String.fromCharCode(categoryMap.get(property)); + return `${codedStart}${codedProp}`; + }).join('') } -]; -`); +\``); /** * Reads a Unicode character property file. From 47d945f30934bd12b0c16af8a104308521ed6cef Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Fri, 16 Feb 2024 11:56:44 +0700 Subject: [PATCH 2/7] fix(web): conditional import path --- common/models/wordbreakers/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/models/wordbreakers/package.json b/common/models/wordbreakers/package.json index bd530aaeab1..774ceee0422 100644 --- a/common/models/wordbreakers/package.json +++ b/common/models/wordbreakers/package.json @@ -18,7 +18,7 @@ "types": "build/obj/index.d.ts", "exports": { ".": { - "es6-bundling": "./src/index.ts", + "es6-bundling": "./src/main/index.ts", "default": "./build/obj/index.js" }, "./lib": { From 5045df13dbf5b14b699e9cf7898984f5d05cc2ae Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Wed, 7 Aug 2024 09:37:07 +0700 Subject: [PATCH 3/7] change(common/models): improve encoding format --- .../wordbreakers/src/data-compiler/index.ts | 55 +++++++++--------- .../wordbreakers/src/main/default/data.ts | Bin 90011 -> 9188 bytes .../wordbreakers/src/main/default/index.ts | 39 ++----------- 3 files changed, 33 insertions(+), 61 deletions(-) diff --git a/common/models/wordbreakers/src/data-compiler/index.ts b/common/models/wordbreakers/src/data-compiler/index.ts index 71ac2147686..e4b5d72357c 100644 --- a/common/models/wordbreakers/src/data-compiler/index.ts +++ b/common/models/wordbreakers/src/data-compiler/index.ts @@ -2,8 +2,6 @@ // Original version found at: https://github.com/eddieantonio/unicode-default-word-boundary/blob/master/libexec/compile-word-break.js -// TODO: Adapt to produce two string-encoded arrays - one for BMP chars, one for non-BMP chars. - import fs from 'fs'; import path from 'path'; @@ -115,11 +113,13 @@ for(let range of ranges) { // already sorted end: undefined }); - nonBmpRanges.push({ - start: 0x10000, - property: finalBmpRange.property, - end: undefined - }); + if(range.start != 0x10000) { + nonBmpRanges.push({ + start: 0x10000, + property: finalBmpRange.property, + end: undefined + }); + } } nonBmpRanges.push(range); @@ -131,6 +131,14 @@ for(let range of ranges) { // already sorted // Save the output in the gen/ directory. let stream = fs.createWriteStream(generatedFilename); +function escape(codedChar: string) { + if(codedChar == '`' || codedChar == '\\') { + return '\\' + codedChar; + } else { + return codedChar; + } +} + // // Former entry in the original version by Eddie that was never included in our repo: // export const extendedPictographic = ${extendedPictographicRegExp}; @@ -164,37 +172,30 @@ ${ /* Enumerate the plain-text names for ease of lookup at runtime */ } ]; -/** - * Constants for indexing values in WORD_BREAK_PROPERTY. - */ -export const enum I { - Start = 0, - Value = 1 -} - export const WORD_BREAK_PROPERTY_BMP: string = \`${ // To consider: emit `\uxxxx` codes instead of the raw char? bmpRanges.map(({start, property}) => { - let codedStart = String.fromCodePoint(start); - if(codedStart == '`') { - // Prevents accidental unescaped use of the string start/end char. - // The backslash gets removed on file-load. - codedStart = '\\`'; - } - const codedProp = String.fromCharCode(categoryMap.get(property)); + let codedStart = escape(String.fromCodePoint(start)); + + // Offset the encoded property value to lie within a friendlier range, + // with characters that render naturally within code editors. + const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20)); return `${codedStart}${codedProp}`; }).join('') -}\` +}\`; export const WORD_BREAK_PROPERTY_NON_BMP: string = \`${ // To consider: emit `\uxxxx` codes instead of the raw char? nonBmpRanges.map(({start, property}) => { - const codedStart = String.fromCodePoint(start); - const codedProp = String.fromCharCode(categoryMap.get(property)); + const codedStart = escape(String.fromCodePoint(start)); + + // Offset the encoded property value to lie within a friendlier range, + // with characters that render naturally within code editors. + const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20)); return `${codedStart}${codedProp}`; }).join('') -} -\``); +}\`; +`); /** * Reads a Unicode character property file. diff --git a/common/models/wordbreakers/src/main/default/data.ts b/common/models/wordbreakers/src/main/default/data.ts index e1cd8aeb88c6d059e7c0f5e4fa04c9ea76e19404..8713fad43c0a0abfae8ca05a9d42141aa230afb2 100644 GIT binary patch literal 9188 zcma)>TW?&~b;tXxqWugzid&gF;-Q(|ljgyRV>C@{xw26sO$&>b#x48LI+*I$4-g4;bm8r<2Zg$qNMFJDlX)jyBi8ojCh zW8~Id$E11>Eh9=~s52qz&cyf~$Hdf!I!k5s4=2_5ovG2=qyH(|KN$H?On5GRka5R% z)ZIyky8gw!*GBGKS5xC0aO|d<;xxBM?%Z+Q6j5g?W&6b4G1>WC`4>CY*a%~z@2N3| zoiilz~^q@4kD>@y;uKM5>dfNF@iRqd<~q*!w_dvNU*GvMUA{j)HZ?LjHu+^{ zoW+SQhZ>oj9KSJome6N;c7+jwPQLQ@JW0;8M&keOhw7~l?>OE(Pio1$>tDt7y1Fqk zNkZNl{TbQx2SL96f26|~i9^b9nxEqRi)U$YQ9OU~$FH8_^AtGmeE-@xei{Dr?$?8V zKJxX@pO2ge{_C6ooOhoG{mZ%Y$eeed$NRkV^Va9kpQ?AR z{Off!qWwZWa9w;bryj(#2XXZvIrt!@9u&0)CH0_u`9VcJsA&(5)JKN)(LMFiruI=; zeN@qYxv73>YyWNN4>Q`sqIy`;9#+)Dn)a}H>0wKIWT;1T+N1f)j~28?%j(gJ_Q9|tdg9MV2ny!6SE z_Q|sP#MD1&s!v+_V?#Z5>W^pC<5~UjoO(R3KVDFeJwuP<>T!DLab9~|xcs=No)`m9 z=G2pwOHWqS6OaBRpq}guKC#r3koI(0e;OKi8dFc>15Xo|o+j1Pl=d{EKh0{#>*{zz zJNE0x*1)l?j)VH~zIJ@5A4k=3Og~O($C-iSns(e!$1UwsLw&lYeQN5TdbLk8`fr!j zZ{6DO4)ot;wcq8`Gh^V{lJ+dFKPza@O6pnV(zB|1R@0sxsb>xSSyQz`hSm=G^>%0* z41i&9AB=!eFs0h}S#y6&Z{Od6hv0EAqqTPzRC{+3TmwA=?OiXpske7ov$v$T_jU)` zdwY62Jgc?CHLw9T_4d9;Yww%jCb$cRReS$npuL{~vs(MWskaYiRr|mNcY|x73Hrb& z7z49l4$Olkum;xIhmHeu9J*Bda2{L)m%&x7eTbch_u!l04t*O8g4l3~4Tsop$hi)4 z@H*H4MOI6-BiIzdrU*7gmO*TaU{eH}BG?qcrU*7guqjdii(nb7fY=tnwg~bf$cxUX zb`;y9*cQdM=qiXmqWCYmrM06LXoEozo1z)!kQ*(6CDo2$OAK3L*b-xJ3|nICjbTd+ zTVmJ}!`S0Of&K*g6X;K%KY{)P`V;6+pg)2B1p1T2CyD+f zK1iZJiT>mUNNkeCCP{3P#3qRyN$f~sM-n@d!c`1CF!l$VaxCico`IfvY3xelmo#$IdXU=fu46?JFIg9Kpva`s}B0I~Ov-d#m zLKa)I7RcGN#30Mrvz$GPzAXB3hStvE`y6M^p*M%#9C~x;&7n7k-W+;!_&kTtbLh{Z zKZpJt`t#_|qd$-SJo@wK&!az&{yci~!tHiP}}hALV%v9cAKQUIRBk>RFk6Wol8Gc$bl1Mt(U* zpEFdldb@(WN&!4l?J9X##l9*vth%k*HRfx~*O;$yXO9;2_7U?(%pWmdM{nH;BEK&E z&_KH$1G%4d?q|IWR=_Gq4C?r#!JY>7vO&FUBtdjE@lz8&H5asY(+%R6CUv%n&KBdX zdDSuEddFy~j?)78qUuaf>z(OUa1Hbfbf&jKn|?^^ z%*?6I%sjXNxc)%mDu5}g{RcCPtWPFkF#e3i;xCL&5$XP_r66=;&x5T<7)-7eh z99RM?Aa*YsT4&h_PJ`%OUI3A|jJy?7bymEfAEfUNsE*sxJMNI`tg>&_1EOaQd27g9 zYpRaNsdcuJsx$U8({gmsa7dMC1@ zI?)Zi!!w{0WlyvLw)9SnJcyA8G1kXeA45)jO?486-k~0M64;&ulWx^XvL?xzBsrAi zTq)L2GdrpKAZJcx20E#l)=3}eoiy_qbZ5ePCsR?KEPAr&$&w#g*A{=$*U|TnD3!B|&uLu_s>y%OJY)>?t6lj4c(gPJHUbzK);kA+6IO zM;hcvgPdxhqp=BYft;fe0P#!10$JZ6t_^HzBBx1gnMv7 z4=;C($i=P^V?KSQYh)N}UFkZVs_UG+)OEVRb^04%@P)23G}z_NcAW&$^I ztvhX~?z9KARCmUscW1JyJ3Fm)XJ^29Z~Qc+Q)beh50mPOvKA?to z%bQ>j4C&npHdl~c$*2=zbYeI`mv&;zfie9Azn>V{;S-|-9x>ifC(~Q{$#h`&1b?1P zhlWq6aVInN;geZIoy@X#mOZms?PP9Eoy>)_lX=GHGs7nf;Ie1yVtgyD_qOuz0?68}me$+0^&WMq7s%+n zKu+~`7PMY)S@nYK4|+i#xDIZB+h9)Xg{JjhXa<}E=fP!g1>6H8suxOtN!8;Wy?4K$ zdb@_++nv^WyXf7UQ@uUUKyMFOdm;MB+pEBjz&h9fTY4{S4D`ZFS}%M6PpMuwqxJUZ z_1-?V>|^hKL-h`jdoZi@4)S{M(5ZTdYg&(*)uU$h4)+;505kM!sz(j$MWT9-_upO= z9-Yy9QT9exz*WZB7iC}ch(5ldhV^(~?Zt9n5iEmMu&MXrhU&$g;53LW@i;sK=PdCe z932U#)=SKTi{Kh)g7-kyCl0{~mo- zXrBe=)n~zFu&(+EL+hiX&pU2Eu?bp3{e-Re6G1RaKL(b;3fKUfU`zFr#!x>w4bFfr za1Qin{Zvl%)5t0`Rli8TSSL>G-RKJ`7(OEI{e#LXCUoqjn%l*nad;=b0JOk#h z^eaWxuez`FtE{WCuIhcUU-f~kt!`ZISN(A0Rb^dC>mOm;QERYYH&nkq|6;$6zWU0g ze%%eO(q98T1O58;<$gWD_zwO1TEBh-wyyLW>}xo+ej@-L4ELM(pc&Ho)Z0Gww%p^! zBEn>LBMjR(W1?+Cj_MaP{42tI!q$a}4*xx2yTbN_<%H#h)r1`hYrr-|$4(i>n-Pud z<-LeT^x86J&k36smKBy0CKmARfJSCRa>~%6uq9z7VP#=eVPa`Ww1p%n_eBTa5okm& zZ%#Bp*q)5-<%Jc5i4@+OXkI;5Vk06pT%*>KZ*m5h)N1e zin+;gL`|}!7H77W z6efAVkCao-kt8rbNod5#`k^opP?sf*Ic6J@$_?o@r|C8&WKHSvc0>~rMkq|f&}n8g z8eC#J@t^5D7@)~WH_L=GFRUnx^O{Z^W=_kFX<0Ij6V2%cJEpl&<_y}*nHgbj#`v8< z<6-ekR2T;|XE>laD|%-|%Iu*q?xi_fVwSfQnpq}X1k`kqgr6WA` zN0<{)b0TU^X6K!-c}{80FG|-fY*m;C@$=g<8<4RbVffEnz?0?z(J~i&!V)ZAAlJ-= zrZ5uNTqMoR#c9T*WSC3hrX_BG$qynLZh*;8BAPW}+;ej&C`=B$gxk&KAOe=-beF{k z%i@C-F?@yGHCJ3LSrJQDL;$~%XvCHk5wH>y#ziz&xQOP89DhaPv2rHmNTf8RE9Sc8 z_R3zLM9@dpnLgR(lL-1`n@_g+Wt~6GI=`fpUsCU$6LybE zXx`hAt~gOjmANVMwq$logl@?Jw?x#Ih}xDV+guWJyCRJ0V+IUn152+VPYvi>1gB# z;j%DM9+r57MasU!cVE(RUv9;|M0#Hg-xtFVMDGE&*5u>nsd;F!gwK>T+rsY4nE3Q? zPZ%}AJlv-%RnL^FXGWHVNy;eK#Kkem z&bSyZrOM1m(qtr`GZLPRT(FFUCo8jAi2}b3X+%m^(lN)A&dhmPl9R*aqQYdG6gRUV zo-D{sDTpTvVsjzOlET@NnlOn%p)O1+NI~jLagKFGIY&{>Q52O$QCSSh#DOrXo>`Rp zQ564`L?*1gbvGSbavAq*+~VO5H20 zAY)=AUv*B+h7`Ak7{m|MQ&Wnn+2nG16@pCX|C zs;sL~i~aJP@i(~k{>BX4AGSbi@!zM`dcEjMcVpXq^-Xt z7PNSHZMQPRG}N%|R#6zQJ=?8@Fb)$ijA0tmEHE2_&F;Zm8!#8n3An<-NV9+>YamNK z4PTxgJnGPfh)-l^lJojO+SG;p@%;>m8g zaIED@3d;}DaHFg_!fnm*D6-~w1GVO2!q{sqa;L3DijgHDv&;>c>4$lFrdVEE7%s8A zyTUk!<=q!{C@db+qy_u7b)v_(%KNKHpIRSE|aw(r`+ITTN`AywLxZE+uS2-I|2*faVx+L zumU_AtpL@+3Xt+vfEPJyN3`vTHom9P@aVNHu9{`xXUihdEsH2vHomuP(P48hEt`93 z1;u|sZj==i{{>eEPOT7mWkraB#qUKL@pDwX!M{6bsHawJ78VohV*z2}$=H2i8JOgx zRVS)eU3^+64{UjpvH6BWvmlHcZyW5j4bssz_Jom%w!uTrHqLfnv&|ROQ=3nwG~~TK zi)4EiTkKhK!=B}h)}Ey-*)DFG&8HKZRmS*ua%%GhlqSWxC46A>;gg03i!IN1Tb}VY z9}Z|_%qL?$84HFO3z7$R5Z~KD9#3|VUl?{!qy)vPAQ7~s0N8u@z~1v9VDBDGUXJa3 zoMZ1(O6`4oYRfalmS>91=QSEK$F7h$b_K856`mw^h5WNCVPWEf3boU&aF6T?nP^vd z7qu(AfZG+SfL*By6H6;(lU*Sv?JBupSBa%vB@^u`ci*n^fV8XPlB(EJ6f9I~tYM6s(pQjSYH+Et73hXdSF-isbE*dlkz%k*TmAASXvWpHIZDS9@sT< z$d;FFyA~J5d4sb=At>Pv%6n%}DnrmEV{7R%AzV~^E_ubD!^BKjj;Qs-#?L<-l literal 90011 zcmbWAYi}IKm4?6TR}B2cyM`@JbJoy!>_bS^{ZFc zS7)!TFZP#rXRohsFMhb)?|$Cj-ksguo}XRpj=RtDGdCH~D|1KFyCVPk(;5y1qFcULF3m zV1K>)bs^ym^w09&_kUd7-|bf~-oE#@-OY>D@j5ejb+I~TZeHzfZuS=os0WnG>$mq; z%bRbIzkPFcwab4yyj)%F_ZRz%g@Cgs`(JLZZx@7DcgNNK>i*U0+e~Y}@cZ{Sl&=2w z$!hggMs@pX{^5&x`RAAW*Oysd%a6me<)7bvz5ns+o89@tpT_I^A1?RL|Lgwx`0)GJ zhpQjo|NU==i!bkAEw7z^`eJ{40DAvwe|tE8^Y{G6;|XQ=+mF*P$Nkmv@|#!Y>)WjT z<&D#SS$_KSK1=%b^WR?j$G3l3{{Gjjx1V=cyXEI+`yUV2SG&vSpI=>M+c{o8@OF3o z0BV1IeDd!|G*L_y)-QD&1;lbh_4B^xKm#md+Q2Bpvw0KhM zqM#psU48TG&Hn3?)h(v+;y1y+Se@_gvc+5;e$MKCq=grtueX}bHnIBC>buO>M-O`X zXkqcAFXq#~-{|V}%kb>9@zaJpDau^Yla3=SNSz|D8hbj=QVl zX>~s6EgR_D6??c^ef!n3@p=Dj>i+ut@6W#a`}FLa>^yk2eEzUHzCP~qBKp&6y$C+9 z|NDLZB|Z7~I~y7w+{;~d1uwB1gWtX)&|7mx-JI{um&E|Sp&xZ%2I{dgeqs47} z_TV=18;b$GRru9*-Q4O$Uim-lZ(l4(7Rs)&FF4-*y1Ku~WPO_99amS^*->7d zU*2D2fS*2qJ2{*zum$$j{+Gprzklz+ZBGwAETQGG<;iTGnSi{Q_rL72boY6I|6Xe^Z?9jiPFNQWEu>D8ctavBbuh_4tiC&G4Vd`; zyYJt4gzwHid9Z>{&KBC>(WfVV7t8(KvmHJA#r*xF2T_P`oX+E?M*=%Md;H);4+>ff zWbB=fdknll6H(9$BISLT2?$DhMnTkK=>wWEstbdi@ZN88x8S z$&5OPR^seLEft4QG7GKNj*_c2Zk%eBS8HcsRSA`>ET~G3cB_PxtBj~hjytZnrX!7@ z+92tYQ})GLrxos-Vr^9w5)(j2NlF{2EJ;>qFnW-SjFs&ijvT4 z-Vp@!dTl~axT2Kk>xB7&CZ0oj37#FR;&p>^*Ti(zdaZjDjRxPNY!vYvS4yL-2hS*L z4xXgsIj+&nT4`y`Wh%Aj)U)F~YIC{PGj5nlPrSIb9MLxk;0>>BDfK2<7ThGyW*QF3 z6t7KzyQ`+Yz&%HU#}p#~k5;3>TbkP79-@ii`hg^7i$R$(8xANnF&rW)=p2)2=v?bl z(7DhMK*tSDL2pq_(yKPD6WikT>!cs5;GO7hL;A&Ewa!XwjV>Pnd5lujoZk1lz5gFCc zdE=I>PL_w9a4qT0fQuuV_!)OHrPbLKJZD=HI$ER@Bv)Pp@NRAGV;zS}5;%GRDfMoR zOI1Sm46@@Qkb<7jw`CM`gRSS+rWzU60J9Y?ySA8G__h{QB~6J#he`nsk8E>&dirO zD_`o&e5vcLea4*O?Raul8D#J61ZKX}@wARyOoO$UynLzSiklLQmoIgkFDc+WBBOoM zh~cS%K6&M^&P!1`D@Ezd6s6-7MX}1wBP~d7A1Qn6xn(1W>AfVPw~~l{?S-5D8Y>%; zbT5hMtt6s1lZf6*B6>54=&dB8HFBLaM{ni@J?90ARkPF4 za~h$b^QgVF0iLzjTD9_m-pmVnYks@uEJD+6^-+4xA_C}61lhM<+T6EJWv*{Awhm~w zw(J(2l7`L=YGuIY$(6oCrzfGK&kJa`b;bvx2FDkPJpD}uv$PpCqNoF z8kkb4nbP!DO4EA{quy#5^=69FTPe!GIfRn>V8;1_73U8;T1n6dKU#ma*fD-Rpo^rVSdvGTCN9*qX>c#Xk}l?Urs-@uWzHg@Z(gn>)w z0dz|+=G5QdMdE`Mi4SHZK3I|Xz{i=jAiWy;V8zOV87mLg%+X-R%7Yav4`!@9ST*#) ztf3E9m3uI&+=CUf4PKRdFss~y7ZMKERM6msgo70l4qiw&SXJ(Uk66i_gJV1qZw`FG zN_!2S+fmTX68hkk&>Qp3DOHWM=RtU&5(0!aRf?{-kO5pVFm@A zN58e$IS`Z3u^<{icT%j84`piTJZ_+9_adv2#|8n?W}q}$fzrss(2{g3@R_(Nt$ z^VKm@Ff&)I6?C2jQ97MVbkhq4EG zXA-0wIdD`;;(H)9aFmPYlnbE}JfE&sv~!jF1ZMS5D77}^Qi z)0)j)x}^Mqv-r|_i(W>PZ*9OCJVxvqJl`KJrQ;>EXj=4$_)W zE=cg+j8a;2^Fo71L)NM_Z*xd%&TIn&p7eQMMmoT>WH!H!VVUvS}2!spZ_fX}5s4Ij-Rf{#&}hL0OX1mCis!t8LT zf-jSi^qaTVX80cc?5_g(aJDJw=WH{Ak6X2d@7fRFr7Pi^<+W7sb-Eh9cdjqP_vFJV zb|8PN5SGO4{2G2~wU<(9_LuStmV8Riv;+FB@tRcfe2v`ROZF$qd`fNr1myF2m+;;2 zp|md9OC{g37tlXj?rpK~#}(hxsBEa3 zP-OxAd>l@}7q^XS_~J%Txh8ARrPRr^iJHscnjS8(N%?Sc9mt0d?Q43tTP*2e7Z=du zt;w46(*QnasFHp@L!-r0V>4ZX_bTqGwl0j$k7GimskiPGN;S8-bS0iwje3bw!rgV61;a^c53;Af&|YqS9*Q5T(FSv zx&9Ntx7viMbG6$gTrW~x9e-s)@!NSv6_R%ePU?7X*VNaz8%gT<8)F(gdpE7!j>|2Q z9?qr$_+0Ik@Y(lheyt995*I^A@b2vmNn9hK5%4^w-Yj~)PhZKiXXl{x)q9Pgo^P$z z{Cc5b&*rSbn+LK}-{BOsMqr*%O+6cQpa4E05m*FIPiYpc0jDI+L27yizHwgCGn81+ zR2DsOa;F6<4#H{p=%53B-6b~g5(_Z!cPS+Yyl4XWUWI<(Js^N@edRT|$9U4fC2J`k zj)1g*j?PVoY2fr!gXhoi1mc{~LIVV5Ici`Jp%L)c(*h3o3zz`{v!^w9Jxg(XMKd5~ zkR=0?8x+r81x}35X$IzmHldcx%qdj>pR*VVpQpJZ_#XZ2qXYQXfsHitmqa4?XwgzW z+=GeWTRH8_O`8b5DSxv~488pIepk=P#@W^07wQ(#$7OBlsv#N&jFrU1#g8 zYnr)78_+-C9wzDMEjxhEo3MtDJ=6%kx4yVY9Kh#2UeeEJ90K^fWo!7Ze5^a-i#6|$ zL6b1Qf@LGYL4F*wj2HusfNBhA5-KLh6yXy1$G3lZ{FUAOF#20n)i2gurI)nxP{8Ti zZ(r<>M3FUuU#vMR3^DMh$RrO18brpB89455iH8D>;>-Q(%fr&3#_EE#wz4PLKlS>U02~<1`81tTYts7JaIO&tZjj*?i}M0-p~N(6`{V zc=9O%dO^cSj~Bs5f2ZN2w+rAWbP^JN!m<4bK87J0K6Zv9__)hT_`IhD@Vy-^707M` z9}Pm&kLD1;M}cbiuKg6I{L!BW^jlZU6^T2y0er4aOZo5~8No+^YWOJ52tEo_!spE~ zfNvF=i;bCe=ShD{K5lEH82E$iNBk<_w%jTqpGkn}3rF^^t{pFeuW<(5I8GO0s9V0?uwT4S^oHam@xR$L% zn$OZx&aYLLJe$ks!%X=6!8l1jYapQCn${>2U-lFzAQ#JtW5qzhn?{7d`bKS;oeTOsCaV?1eM}&dim5Ywk{J zhHLBI!LD{!a6Mw_TGG!WasmBTe?HUiZIbx~rxMiFtS(J|!=-&`4-HrLBLo=u1qd*c zj1XXVJ3xRQD?m`7OVJ3d+D_N-sht446Y1tJ_?mla!!A*+7;y|ulh3#Q1mgEjRdx-J z@JRaEqv$QN@m6HxrNUifuV6liFZt!PA&@`cgQMZ2c}DPYkCX-^PTgsRwQF>tnxdAA zznUVe+mpXEZLY}HyWzBJc?LmgtTnqIU27fv>RK+gYH{-I)q%eF5@?OUy6B~Axpo)O z$*sUB}G{EgkFLs|+49h5$T<^inw7DG9(k zqY9aNOi(p=tBBQg>^cMRC`>6F4wizI!kuYN5BoMrj~k42JdPaDW4S8vg?BBV-Wo2> zD+`S+f{($QlBf7=djOx$)JynW?+V~^<6gt}`j*+3c=B;W!@k1UR?^R7mw|jHZ>#Hh zJxY2eZ=>t^LMjQKSG(TmdM}c3=Wg;xVzAsO=pSp|>7=^G+G=Cx$8n<$;9I%iEN&@~ z@be3fJDmH3))~O(;{ihc89wH+f8PIayMO&$P2=a+tUWkCk51x)XgDmE%y4P{gIKI! zdhYoU{D-liq$MwWC;x}Bpr9odP7FSXW#hy=bI;kthp{+eXU2kM*bidiR3TVl96d`B zoAvftd?rr8A6@!Kp46A{%}<`rBTo)V_`HK@`H!B57K1k`UY#fIK&sO-`FU*GD|RP# z=)uvai~+aQfFf(%OdL^a0dWK;87^@LJAuz}3ZAQ)C+ix?dE%>fK7@pqju!0JQ#D^z zc>*uk-lDLiY`F{?Fo2ekpO-uR#9yb76q(`oJXv2<$d99w*WO~XPPfhz&x@#2UbFbr zk`m9%(O+Z|GxwV{2Fy34NzB%KM&=g-znB3lB$?U&%8X_fADD`adQO-XXdzkm*v)f+ zenK1R%(H+}$N!&Gq_g;JlSIOYP6HGc8_R0Yhe~p3Ti^w#rh#}v3z^fs| zfC-!AVPkK)8&eM()`LtimhU+_EvWpw+0@8ySTnZE^X|63F@qm>&_ADz;ZQ?rddqdB zz?Qa^w;r};F(UI|dW~(30bj&J5T4VY1sCE!qTlK|^!5rmW-aua|rBvII zK+WHM4lrO3IM8%$4R~dq*}6jvo?mKgTWYNHz4=?XT=)!lXn3|q>Y*{mrZY{}8QdUs#1#rJ+94AwnR`HR)Irdz(LL+YXB`O}C8YXUXpgYr1N9AN19ZU&tm zcWVs#B-0%qeUA`$E41U%X@G$HM3IoVkFI$(@4U;3vxcvh=h4|mJeV6$$HPj2Sk3Zp z$Gby-fI~#B1$%SICrYU=pfT7kqch8n9Xo(X&s@sUJPfuac}Hxru}a5k+m*b8Vdqbv9RytE$q zN>T0kyc_W{0xpmRh04(kmL^+qlUWL#_>aV3AibmDz9Kw*ugXC64C38W;(c%&YZdGbEf#8;UI zOtUwR`I`)6d=Uok zp0Jb(4lJc@6g+vVZKL4oX2b)J%?Bu~vy_= zswGe9OD~fzqBb30X`La>QKju#OY0 za9a=JaBX$A@JOo7!HpRAj_HqJ9{X-(qk&OxPt~PShs@b<>r3W zIVYC5d;*fZ;n|D;YfBLVF9I#y5?smkF=-d}D0$kj$Iv`)c<}`5dE*_G$jLfZL4xdL zV_jvplml?N1x#<<{cXd>sx@cFxMJi!V1mz9M4Gd1-OY04n~FjV7QpJCF9*z%X-G)X+J6(Hc6hDP9?tw>wW)&r78Tyl~mdvH5d+`rQ#Rh+kKlI*%; zWdZ^T4;C1T?jLp@~#=%Om1;LU}m)}0H@Z(#nQrNncj5f8kC^crosGo?x5lME38&yTgdej)^J7~1l@V1R-* z#Yhia&5claMUB?p6JI871<#sLC8wjc2@VaBdW z;)`iR3}{Oc6IctCB-sZOl6x>AzknsRfa@;0VD9mE8dK^xcnDD-QChHKDqahg>j(h? zv%@Yf#E*F98d*Tn*6mq#+y)9zv|cPA=JX@3%`T9*sXxNNw-!m$u*bL4z%f>U0i_!7 zFz}t;?#UYma!~ySd? z{H+M71$#_Cjb?^2y2p+)23$V@54f8IdLFsAro}p%>rEUSh!7wTnrA+y93k*ZWFt>% z1PFLuJ<@`^Nss0xEv|A768zD0F{68)DUI9)(u%Gy)s4hjd5Pl4AB2c7xN*kF(<7Q` z_QQeku_F!;;4D>y09y%>kobGynj}sWBV*;XPhyz3-WBjL@u4p*rpY`kA&&NF1T%kp zG~i+8;}sDK%=#h}=D5kslYbG05@`yU;OdXGjoF>#oB10A8i5_|&89E54-cjM84VgN zQat1lPqXS>lwN{W*e&*A!(Y3WJZ#(^O|j+9l*BNz^VZf~-!#QxD~a%em@m z+f;5i|B(wWzkobwuJaqXvj-@=!`7JvQ%swRNm8oL{XP>z)p^(PR2}O zaR{gP-Kyh&Nn_x_oQMbO;_Ot3`w$`?yq;y%@n}TEgZ1I9%ma_Y%H@z>u!^mfeSbpL zv)9lh^_|`4b&PkwVbr@1D@}`ePL-zk*B$C~$j-QDDa0duIRynaBc<tLZrpA>K-%YL& zZ~-{rptH;<-^hjAx<*iVGs&Y}q8K;4l}H(FcpuU3Xxy-S(HMBdA;Q4^MPuOG+d~Z2 z20d;%G}RCTR<8mca2p9Rpa%#rpm9eSde0|q_-dUHgB4MZ8@`Mu!ocTlw3PUMi2%da zyJtK%UC}Z^4Csj^4_iJ!9AV%QV~v6DVGc2P*1F{{en%L1%ue&bEuavC6(@~bw``K% zKtH2-aEA&q2DH8a1NwkSN?Zle82Gcb0ft;*cG9cdz{i4Gguy$*HCB9uSA@a4p=GT2 z%vXfL`=Hxc@fUjn47KSsYTIknd_Y#(Qq89fA|ALbs4=)z!Ljx_kYnv8xnu36p!s=s zP1ihUr=`S^cccgJPP?(@eLckB#TqqV?4+AUqiFX@DT(oRoj!D4|77fViwh8NR;fAQ zBk}}eXa4k?PqTrh-B+~^Sq>o6NFoFq4;x=NSIrI9oY2(hlOdFSpjm^Zyy8~{j z-5z#+!`pu93Adj>*ENetT2nn?eI>+TMFCUIdWbOa@qWpJh@S%t4Oixc)gIBIL<`g@4M^h;YT%7hq9 zzr+_{h8Rp++S+T{8~mN7EvCmI59XS7Z}H4DVWJh-62^!T4}4reW2o$1ln1H<47RoM z@J@ii-bKB^w`r=-G9o?j2P-rW{Jn}0gSp~rd&POfds9x!T;IwcDAH1L#{;K{do2M5 z+iSFF8yW)_q$5px<9pNOPRrys%zc{gMT~gpEh)|U-geKHX0zxuC*IdJ53@NVI48H0 zIOkgeiECv`bDFSb{A~Z@;reQK`TX;%i^KWuczp}^BF*BHAQD;04Q-hz&tiH`BIvoo zq041+Z&jL>HV5u@Y6x?A!RZp@=Q)X0GRiNoM-wud)}jKg5hNDsTLj<#@yY)IeG5BQ diff --git a/common/models/wordbreakers/src/main/default/index.ts b/common/models/wordbreakers/src/main/default/index.ts index ef50ab4b31c..e606a3fbfb9 100644 --- a/common/models/wordbreakers/src/main/default/index.ts +++ b/common/models/wordbreakers/src/main/default/index.ts @@ -1,4 +1,6 @@ -import { WordBreakProperty, WORD_BREAK_PROPERTY, I, propertyMap } from "./data.js"; +import { WordBreakProperty, propertyMap } from "./data.js"; + +import { searchForProperty } from "./searchForProperty.js"; /** * A set of options used to customize and extend the behavior of the default @@ -566,7 +568,7 @@ function property(character: string, options?: DefaultWordBreakerOptions): WordB // TODO: remove dependence on character.codepointAt()? let codepoint = character.codePointAt(0) as number; - return searchForProperty(codepoint, 0, WORD_BREAK_PROPERTY.length - 1); + return searchForProperty(codepoint); } function propertyVal(propName: string, options?: DefaultWordBreakerOptions) { @@ -574,35 +576,4 @@ function propertyVal(propName: string, options?: DefaultWordBreakerOptions) { const customIndex = options?.customProperties?.findIndex(matcher) ?? -1; return customIndex != -1 ? -customIndex - 1 : propertyMap.findIndex(matcher); -} - -/** - * Binary search for the word break property of a given CODE POINT. - * - * The auto-generated data.ts master array defines a **character range** - * lookup table. If a character's codepoint is equal to or greater than - * the I.Start value for an entry and exclusively less than the next entry, - * it falls in the first entry's range bucket and is classified accordingly - * by this method. - */ -function searchForProperty(codePoint: number, left: number, right: number): WordBreakProperty { - // All items that are not found in the array are assigned the 'Other' property. - if (right < left) { - return WordBreakProperty.Other; - } - - let midpoint = left + ~~((right - left) / 2); - let candidate = WORD_BREAK_PROPERTY[midpoint]; - - let nextRange = WORD_BREAK_PROPERTY[midpoint + 1]; - let startOfNextRange = nextRange ? nextRange[I.Start] : Infinity; - - if (codePoint < candidate[I.Start]) { - return searchForProperty(codePoint, left, midpoint - 1); - } else if (codePoint >= startOfNextRange) { - return searchForProperty(codePoint, midpoint + 1, right); - } - - // We found it! - return candidate[I.Value]; -} +} \ No newline at end of file From fdecc602d336069ba809bdf7143c7804bbafaaff Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Wed, 7 Aug 2024 09:37:22 +0700 Subject: [PATCH 4/7] feat(common/models): connect encoded lookup table --- .../src/main/default/searchForProperty.ts | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/common/models/wordbreakers/src/main/default/searchForProperty.ts b/common/models/wordbreakers/src/main/default/searchForProperty.ts index 45a1641bd9d..9fa8f965961 100644 --- a/common/models/wordbreakers/src/main/default/searchForProperty.ts +++ b/common/models/wordbreakers/src/main/default/searchForProperty.ts @@ -1,20 +1,19 @@ -import { WordBreakProperty } from "./data.js"; +import { WordBreakProperty, WORD_BREAK_PROPERTY_BMP, WORD_BREAK_PROPERTY_NON_BMP } from "./data.js"; export function searchForProperty(codePoint: number): WordBreakProperty { const bucketSize = codePoint <= 0xFFFF ? 2 : 3; // SMP chars take a bit more space to encode. - // TODO: encode the strings & import them here. - const encodedArray = bucketSize == 2 ? "" /* BMP string */ : "" /* non-BMP string */; + const encodedArray = bucketSize == 2 ? WORD_BREAK_PROPERTY_BMP : WORD_BREAK_PROPERTY_NON_BMP; - return _searchForProperty(encodedArray, codePoint, bucketSize, 0, encodedArray.length / bucketSize - 1); + return _searchForProperty(encodedArray, codePoint, bucketSize, 0, encodedArray.length / bucketSize - 1) - 0x20; } /** * Binary search for the word break property of a given CODE POINT. * - * The auto-generated data.ts master array defines a **character range** - * lookup table. If a character's codepoint is equal to or greater than + * The auto-generated data.ts master strings encode **character range** + * lookup tables. If a character's codepoint is equal to or greater than * the start-of-range value for an entry and exclusively less than the next * entry's start-of-range, it falls within the first entry's range bucket * and is classified accordingly by this method. From b145adfaf8cccfb4923c14dc1e534929f0a93b88 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Wed, 7 Aug 2024 09:50:45 +0700 Subject: [PATCH 5/7] fix(web): fixes value for end of BMP range --- .../wordbreakers/src/data-compiler/index.ts | 2 +- .../wordbreakers/src/main/default/data.ts | Bin 9188 -> 9188 bytes 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/common/models/wordbreakers/src/data-compiler/index.ts b/common/models/wordbreakers/src/data-compiler/index.ts index e4b5d72357c..1ff2f6dccf6 100644 --- a/common/models/wordbreakers/src/data-compiler/index.ts +++ b/common/models/wordbreakers/src/data-compiler/index.ts @@ -109,7 +109,7 @@ for(let range of ranges) { // already sorted const finalBmpRange = bmpRanges[bmpRanges.length - 1]; bmpRanges.push({ start: 0xFFFF, - property: range.property, + property: finalBmpRange.property, end: undefined }); diff --git a/common/models/wordbreakers/src/main/default/data.ts b/common/models/wordbreakers/src/main/default/data.ts index 8713fad43c0a0abfae8ca05a9d42141aa230afb2..5a4062be830d67e97df5794c78f10c50e190cf6a 100644 GIT binary patch delta 14 VcmaFj{=|L5EfGeA&9_B3`2jT-1<(Kh delta 14 VcmaFj{=|L5EfGfT&9_B3`2jUq1=|1s From 02f721dc24247b9e8b2b9c9652e4ec29ba8e353b Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Wed, 7 Aug 2024 09:51:49 +0700 Subject: [PATCH 6/7] feat(common/models): add property-lookup unit test set --- .../wordbreakers/test/test-search-property.js | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 common/models/wordbreakers/test/test-search-property.js diff --git a/common/models/wordbreakers/test/test-search-property.js b/common/models/wordbreakers/test/test-search-property.js new file mode 100644 index 00000000000..cdf80c45f45 --- /dev/null +++ b/common/models/wordbreakers/test/test-search-property.js @@ -0,0 +1,33 @@ +/** + * Smoke-test the default + */ + +import { assert } from 'chai'; +import { searchForProperty } from '../build/obj/default/searchForProperty.js'; +import { propertyMap } from '../build/obj/default/data.js'; + +describe('searchForProperty', () => { + it('correctly finds character classes for standard ASCII characters', () => { + assert.equal(searchForProperty('a'.codePointAt(0)), propertyMap.indexOf('ALetter')); + assert.equal(searchForProperty('Z'.codePointAt(0)), propertyMap.indexOf('ALetter')); + + assert.equal(searchForProperty("'".codePointAt(0)), propertyMap.indexOf('Single_Quote')); + assert.equal(searchForProperty('"'.codePointAt(0)), propertyMap.indexOf('Double_Quote')); + assert.equal(searchForProperty(','.codePointAt(0)), propertyMap.indexOf('MidNum')); + assert.equal(searchForProperty('.'.codePointAt(0)), propertyMap.indexOf('MidNumLet')); + assert.equal(searchForProperty('-'.codePointAt(0)), propertyMap.indexOf('Other')); + }); + + it('correctly finds character classes for specialized BMP characters', () => { + assert.equal(searchForProperty(0x05D0), propertyMap.indexOf('Hebrew_Letter')); + assert.equal(searchForProperty(0x3031), propertyMap.indexOf('Katakana')); + assert.equal(searchForProperty(0xFFFE), propertyMap.indexOf('Other')); + assert.equal(searchForProperty(0xFFFF), propertyMap.indexOf('Other')); + }); + + it('correctly finds character classes for non-BMP characters', () => { + assert.equal(searchForProperty(0x0001F1E6), propertyMap.indexOf('Regional_Indicator')); + assert.equal(searchForProperty(0x00013430), propertyMap.indexOf('Format')); + assert.equal(searchForProperty(0x00010000), propertyMap.indexOf('ALetter')); + }); +}); \ No newline at end of file From 4159bab4586a237b7c846ae9fad8ea27841dc881 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Fri, 9 Aug 2024 09:30:50 +0700 Subject: [PATCH 7/7] fix(common/models): fix unit-test reference to renamed file --- common/models/wordbreakers/test/test-search-property.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/models/wordbreakers/test/test-search-property.js b/common/models/wordbreakers/test/test-search-property.js index cdf80c45f45..85996c7286e 100644 --- a/common/models/wordbreakers/test/test-search-property.js +++ b/common/models/wordbreakers/test/test-search-property.js @@ -4,7 +4,7 @@ import { assert } from 'chai'; import { searchForProperty } from '../build/obj/default/searchForProperty.js'; -import { propertyMap } from '../build/obj/default/data.js'; +import { propertyMap } from '../build/obj/default/data.inc.js'; describe('searchForProperty', () => { it('correctly finds character classes for standard ASCII characters', () => {