Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix unicode issue #461

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
.vs/
.vscode/
support/vscode/koka.language-koka/whatsnew.md
EastAsianWidth.txt
src/Syntax/Lexer.hs.gen
node_modules/
out/
Expand Down
280 changes: 270 additions & 10 deletions lib/std/text/unicode.kk
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,18 @@ pub fun is-combining( c : char ) : bool {
(i >= 0x1AB0 && i <= 0x1AFF) ||
(i >= 0x1DC0 && i <= 0x1DFF) ||
(i >= 0x20D0 && i <= 0x20FF) ||
(i >= 0xFE20 && i <= 0xFE2F))
(i >= 0xFE20 && i <= 0xFE2F) ||
(i >= 0xFE00 && i <= 0xFE0F)) // Added variation selectors
// Should we instead add `zero-widths.force.contains(i)`?
}

// Join combining characters with their base into a grapheme.
fun join-combining( cs : list<char>, comb : list<char> = [], acc : list<grapheme> = []) : list<grapheme> {
match(cs) {
Cons(zwj, cc) | zwj.int == 0x200D -> // Handle zero-width-joiner
match cc
Cons(c, cc') -> cc'.join-combining(Cons(c, Cons(zwj,comb)), acc)
Nil -> cc.join-combining(Cons(zwj, comb), acc)
Cons(c,cc) -> if (c.is-combining)
then cc.join-combining( Cons(c,comb), acc )
else cc.join-combining( [c], consrev(comb,acc) )
Expand Down Expand Up @@ -119,21 +125,275 @@ pub fun string/width( s : string ) : int {
//--------------------------------------------------------------

// These characters are considered wide, i.e. 2 columns wide.
// https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
// See ranges with postfix ;W
//
// Regenerate with `stack exec koka -- -e util/update-unicode -- -a` and paste the printed `single(..)` lines below.
// TODO: Handle 'unassigned' ranges: (Following is an excerpt from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt)
// - All code points, assigned or unassigned, that are not listed
// explicitly are given the value "N".
// - The unassigned code points in the following blocks default to "W":
// CJK Unified Ideographs Extension A: U+3400..U+4DBF
// CJK Unified Ideographs: U+4E00..U+9FFF
// CJK Compatibility Ideographs: U+F900..U+FAFF
// - All undesignated code points in Planes 2 and 3, whether inside or
// outside of allocated blocks, default to "W":
// Plane 2: U+20000..U+2FFFD
// Plane 3: U+30000..U+3FFFD
// Lazily built (`delay`) range tree over the code point ranges listed below.
// Each `single(lo,hi)` entry contributes one range (presumably inclusive on both
// ends -- confirm against the definition of `single`).
val asian-wide : delayed<total,rtree> = delay{
build-rtree([
single(0x1100,0x115F),
single(0x231A,0x231B),
single(0x2329,0x2329),
single(0x232A,0x232A),
// NOTE(review): the next two broad ranges overlap the finer-grained entries further
// down (e.g. 0x2E80-0x2E99, 0x3400-0x4DBF) and are followed by smaller code points
// (0x23E9), so the list is neither sorted nor disjoint at this point. Confirm that
// `build-rtree` tolerates this, or drop them if they are leftovers from before the
// list was regenerated.
single(0x2E80,0x303E),
single(0x3040,0xA4CF),
single(0x23E9,0x23EC),
single(0x23F0,0x23F0),
single(0x23F3,0x23F3),
single(0x25FD,0x25FE),
single(0x2614,0x2615),
single(0x2648,0x2653),
single(0x267F,0x267F),
single(0x2693,0x2693),
single(0x26A1,0x26A1),
single(0x26AA,0x26AB),
single(0x26BD,0x26BE),
single(0x26C4,0x26C5),
single(0x26CE,0x26CE),
single(0x26D4,0x26D4),
single(0x26EA,0x26EA),
single(0x26F2,0x26F3),
single(0x26F5,0x26F5),
single(0x26FA,0x26FA),
single(0x26FD,0x26FD),
single(0x2705,0x2705),
single(0x270A,0x270B),
single(0x2728,0x2728),
single(0x274C,0x274C),
single(0x274E,0x274E),
single(0x2753,0x2755),
single(0x2757,0x2757),
single(0x2795,0x2797),
single(0x27B0,0x27B0),
single(0x27BF,0x27BF),
single(0x2B1B,0x2B1C),
single(0x2B50,0x2B50),
single(0x2B55,0x2B55),
single(0x2E80,0x2E99),
single(0x2E9B,0x2EF3),
single(0x2F00,0x2FD5),
single(0x2FF0,0x2FFF),
single(0x3001,0x3003),
single(0x3004,0x3004),
single(0x3005,0x3005),
single(0x3006,0x3006),
single(0x3007,0x3007),
single(0x3008,0x3008),
single(0x3009,0x3009),
single(0x300A,0x300A),
single(0x300B,0x300B),
single(0x300C,0x300C),
single(0x300D,0x300D),
single(0x300E,0x300E),
single(0x300F,0x300F),
single(0x3010,0x3010),
single(0x3011,0x3011),
single(0x3012,0x3013),
single(0x3014,0x3014),
single(0x3015,0x3015),
single(0x3016,0x3016),
single(0x3017,0x3017),
single(0x3018,0x3018),
single(0x3019,0x3019),
single(0x301A,0x301A),
single(0x301B,0x301B),
single(0x301C,0x301C),
single(0x301D,0x301D),
single(0x301E,0x301F),
single(0x3020,0x3020),
single(0x3021,0x3029),
single(0x302A,0x302D),
single(0x302E,0x302F),
single(0x3030,0x3030),
single(0x3031,0x3035),
single(0x3036,0x3037),
single(0x3038,0x303A),
single(0x303B,0x303B),
single(0x303C,0x303C),
single(0x303D,0x303D),
single(0x303E,0x303E),
single(0x3041,0x3096),
single(0x3099,0x309A),
single(0x309B,0x309C),
single(0x309D,0x309E),
single(0x309F,0x309F),
single(0x30A0,0x30A0),
single(0x30A1,0x30FA),
single(0x30FB,0x30FB),
single(0x30FC,0x30FE),
single(0x30FF,0x30FF),
single(0x3105,0x312F),
single(0x3131,0x318E),
single(0x3190,0x3191),
single(0x3192,0x3195),
single(0x3196,0x319F),
single(0x31A0,0x31BF),
single(0x31C0,0x31E3),
single(0x31EF,0x31EF),
single(0x31F0,0x31FF),
single(0x3200,0x321E),
single(0x3220,0x3229),
single(0x322A,0x3247),
single(0x3250,0x3250),
single(0x3251,0x325F),
single(0x3260,0x327F),
single(0x3280,0x3289),
single(0x328A,0x32B0),
single(0x32B1,0x32BF),
single(0x32C0,0x32FF),
single(0x3300,0x33FF),
single(0x3400,0x4DBF),
single(0x4E00,0x9FFF),
single(0xA000,0xA014),
single(0xA015,0xA015),
single(0xA016,0xA48C),
single(0xA490,0xA4C6),
single(0xA960,0xA97C),
single(0xAC00,0xD7A3),
// NOTE(review): 0xF900-0xFAFF, 0xFE10-0xFE19, 0xFE30-0xFE6F, 0x20000-0x2FFFD and
// 0x30000-0x3FFFD below overlap finer-grained entries that appear later in this list.
// 0xFF00-0xFF60 and 0xFFE0-0xFFE6 have no finer-grained counterpart anywhere in the
// list; presumably they are `;F` (fullwidth) rather than `;W` rows in
// EastAsianWidth.txt -- confirm before dropping them when regenerating.
single(0xF900,0xFAFF),
single(0xFE10,0xFE19),
single(0xFE30,0xFE6F),
single(0xFF00,0xFF60),
single(0xFFE0,0xFFE6),
single(0x20000,0x2FFFD),
single(0x30000,0x3FFFD),
single(0xF900,0xFA6D),
single(0xFA6E,0xFA6F),
single(0xFA70,0xFAD9),
single(0xFADA,0xFAFF),
single(0xFE10,0xFE16),
single(0xFE17,0xFE17),
single(0xFE18,0xFE18),
single(0xFE19,0xFE19),
single(0xFE30,0xFE30),
single(0xFE31,0xFE32),
single(0xFE33,0xFE34),
single(0xFE35,0xFE35),
single(0xFE36,0xFE36),
single(0xFE37,0xFE37),
single(0xFE38,0xFE38),
single(0xFE39,0xFE39),
single(0xFE3A,0xFE3A),
single(0xFE3B,0xFE3B),
single(0xFE3C,0xFE3C),
single(0xFE3D,0xFE3D),
single(0xFE3E,0xFE3E),
single(0xFE3F,0xFE3F),
single(0xFE40,0xFE40),
single(0xFE41,0xFE41),
single(0xFE42,0xFE42),
single(0xFE43,0xFE43),
single(0xFE44,0xFE44),
single(0xFE45,0xFE46),
single(0xFE47,0xFE47),
single(0xFE48,0xFE48),
single(0xFE49,0xFE4C),
single(0xFE4D,0xFE4F),
single(0xFE50,0xFE52),
single(0xFE54,0xFE57),
single(0xFE58,0xFE58),
single(0xFE59,0xFE59),
single(0xFE5A,0xFE5A),
single(0xFE5B,0xFE5B),
single(0xFE5C,0xFE5C),
single(0xFE5D,0xFE5D),
single(0xFE5E,0xFE5E),
single(0xFE5F,0xFE61),
single(0xFE62,0xFE62),
single(0xFE63,0xFE63),
single(0xFE64,0xFE66),
single(0xFE68,0xFE68),
single(0xFE69,0xFE69),
single(0xFE6A,0xFE6B),
single(0x16FE0,0x16FE1),
single(0x16FE2,0x16FE2),
single(0x16FE3,0x16FE3),
single(0x16FE4,0x16FE4),
single(0x16FF0,0x16FF1),
single(0x17000,0x187F7),
single(0x18800,0x18AFF),
single(0x18B00,0x18CD5),
single(0x18D00,0x18D08),
single(0x1AFF0,0x1AFF3),
single(0x1AFF5,0x1AFFB),
single(0x1AFFD,0x1AFFE),
single(0x1B000,0x1B0FF),
single(0x1B100,0x1B122),
single(0x1B132,0x1B132),
single(0x1B150,0x1B152),
single(0x1B155,0x1B155),
single(0x1B164,0x1B167),
single(0x1B170,0x1B2FB),
single(0x1F004,0x1F004),
single(0x1F0CF,0x1F0CF),
single(0x1F18E,0x1F18E),
single(0x1F191,0x1F19A),
single(0x1F200,0x1F202),
single(0x1F210,0x1F23B),
single(0x1F240,0x1F248),
single(0x1F250,0x1F251),
single(0x1F260,0x1F265),
single(0x1F300,0x1F320),
single(0x1F32D,0x1F335),
single(0x1F337,0x1F37C),
single(0x1F37E,0x1F393),
single(0x1F3A0,0x1F3CA),
single(0x1F3CF,0x1F3D3),
single(0x1F3E0,0x1F3F0),
single(0x1F3F4,0x1F3F4),
single(0x1F3F8,0x1F3FA),
single(0x1F3FB,0x1F3FF),
single(0x1F400,0x1F43E),
single(0x1F440,0x1F440),
single(0x1F442,0x1F4FC),
single(0x1F4FF,0x1F53D),
single(0x1F54B,0x1F54E),
single(0x1F550,0x1F567),
single(0x1F57A,0x1F57A),
single(0x1F595,0x1F596),
single(0x1F5A4,0x1F5A4),
single(0x1F5FB,0x1F5FF),
single(0x1F600,0x1F64F),
single(0x1F680,0x1F6C5),
single(0x1F6CC,0x1F6CC),
single(0x1F6D0,0x1F6D2),
single(0x1F6D5,0x1F6D7),
single(0x1F6DC,0x1F6DF),
single(0x1F6EB,0x1F6EC),
single(0x1F6F4,0x1F6FC),
single(0x1F7E0,0x1F7EB),
single(0x1F7F0,0x1F7F0),
single(0x1F90C,0x1F93A),
single(0x1F93C,0x1F945),
single(0x1F947,0x1F9FF),
single(0x1FA70,0x1FA7C),
single(0x1FA80,0x1FA88),
single(0x1FA90,0x1FABD),
single(0x1FABF,0x1FAC5),
single(0x1FACE,0x1FADB),
single(0x1FAE0,0x1FAE8),
single(0x1FAF0,0x1FAF8),
single(0x20000,0x2A6DF),
single(0x2A6E0,0x2A6FF),
single(0x2A700,0x2B739),
single(0x2B73A,0x2B73F),
single(0x2B740,0x2B81D),
single(0x2B81E,0x2B81F),
single(0x2B820,0x2CEA1),
single(0x2CEA2,0x2CEAF),
single(0x2CEB0,0x2EBE0),
single(0x2EBE1,0x2EBEF),
single(0x2EBF0,0x2EE5D),
single(0x2EE5E,0x2F7FF),
single(0x2F800,0x2FA1D),
single(0x2FA1E,0x2FA1F),
single(0x2FA20,0x2FFFD),
single(0x30000,0x3134A),
single(0x3134B,0x3134F),
single(0x31350,0x323AF),
single(0x323B0,0x3FFFD)
])
}

Expand Down
2 changes: 2 additions & 0 deletions package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
# - support/vscode/koka.language-koka/package.json
# - whatsnew.md, readme.md

# Also update the `asian-wide` range table in `lib/std/text/unicode.kk`
# using the output of `stack exec koka -- -e util/update-unicode -- -a`

name: koka
version: 3.0.5
Expand Down
11 changes: 11 additions & 0 deletions test/lib/unicode.kk
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// https://github.com/koka-lang/koka/issues/457
// https://github.com/koka-lang/koka/issues/458
import std/text/unicode

// Prints the code points of a joined emoji sequence, the number of graphemes
// it splits into, and the display width of a single emoji.
fun main()
  // 'h', 'i', heart (U+2764), variation selector (U+FE0F),
  // zero width joiner (U+200D), fire (U+1F525)
  val sample = "hi❤️‍🔥"
  sample.list.println
  sample.graphemes.length.println

  println("👾".width)
3 changes: 3 additions & 0 deletions test/lib/unicode.kk.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
['h','i','/u2764','/uFE0F','/u200D','/U01F525']
3
2
45 changes: 45 additions & 0 deletions util/update-unicode.kk
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import std/os/path
import std/os/dir
import std/os/file
import std/os/process
import std/os/env
import std/os/flags

// Options collected from the command line.
struct iflags {
  // When set, `main` prints the regenerated list of wide ranges.
  asian-wide : bool = False
}

// Usage banner; `process-flags` passes it to `usage` so it is printed ahead of the option list.
val header = "usage:\n stack exec koka -- -e util/update-unicode [-- [options]]\n\noptions:"

// Flag table handed to `parse` and `usage`: a single boolean flag (`a` / `asian-wide`)
// whose handler stores the parsed value in `iflags.asian-wide`.
val flag-descs =
  [ Flag( "a",
          ["asian-wide"],
          Bool( fn( f : iflags, b : bool ) : iflags { f(asian-wide = b) } ),
          "print updated asian wide information" ) ]

// Parse the process arguments against `flag-descs`.
// Returns `Just(flags)` only when parsing produced no errors and no leftover
// (non-flag) arguments; otherwise prints the errors followed by the usage text
// and returns `Nothing`.
pub fun process-flags() : <ndet,console> maybe<iflags>
  val (parsed, leftover, problems) = parse( Iflags(), flag-descs, get-args() )
  match (problems, leftover)
    (Nil, Nil) -> Just(parsed)
    _ ->
      println( problems.join("\n") ++ "\n" ++ flag-descs.usage(header) )
      Nothing

// Parse one line of EastAsianWidth.txt.
//
// Returns `Just((lo,hi))` holding the bounds of the code point range exactly as
// written in the file (hexadecimal digits without prefix) when the line assigns
// the width class `W` (wide) or `F` (fullwidth). A single code point yields
// `lo == hi`. Returns `Nothing` for every other line (comments, blank lines,
// other width classes).
//
// Throws when the code point field of a wide row contains more than one `..`.
fun parse-wide-range( line : string ) : exn maybe<(string,string)>
  // Strip the trailing `# ...` comment *before* splitting on `;`, so a comment
  // that happens to contain a semicolon is never mistaken for a data row.
  val content = match line.split("#")
    Cons(first, _) -> first
    Nil -> ""
  match content.split(";")
    Cons(codepoints, Cons(property, _)) ->
      val width = property.trim
      // Both `W` and `F` occupy two columns (see UAX #11); emitting only `W`
      // would drop the fullwidth forms from the generated table.
      if width == "W" || width == "F" then
        val charrange = codepoints.trim.split("..")
        match charrange
          Cons(lo, Nil) -> Just((lo, lo))
          Cons(lo, Cons(hi, Nil)) -> Just((lo, hi))
          _ -> throw("Error unsupported range " ++ charrange.show)
      else Nothing
    _ -> Nothing

// Entry point: with `-a`, download EastAsianWidth.txt via `curl` into the current
// directory and print one `single(0xLO,0xHI),` line per wide range, ready to be
// pasted into the `asian-wide` table of `std/text/unicode`.
fun main()
  match process-flags()
    // Invalid arguments: `process-flags` has already printed the errors and the
    // usage text, so there is nothing left to do (and no reason to throw).
    Nothing -> ()
    Just(flags) ->
      if flags.asian-wide then
        val status = run-system(r#"curl "https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt" > EastAsianWidth.txt"#)
        if status != 0 then
          // The shell redirection creates the file even when curl fails, so the
          // exit status is the only reliable sign that the contents are usable.
          println("Failed to download EastAsianWidth.txt (curl exit status " ++ status.show ++ ")")
        elif "EastAsianWidth.txt".path.is-file then
          val lines = "EastAsianWidth.txt".path.read-text-file().split("\n")
          lines.foreach fn(line)
            match parse-wide-range(line)
              Just((lo, hi)) -> println(" single(0x" ++ lo ++ ",0x" ++ hi ++ "),")
              Nothing -> ()
        else
          println("Could not find EastAsianWidth.txt\n\tPlease download from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt to your current directory")