[move-compiler] More unicode support (MystenLabs#14582)

## Description - Relaxed ASCII restriction for Move files, banning only certain control structures ## Test Plan - New tests --- If your changes are not user-facing and not a breaking change, you can skip the following section. Otherwise, please indicate what changed, and then add to the Release Notes section as highlighted during the release process. ### Type of Change (Check all that apply) - [ ] protocol change - [X] user-visible impact - [ ] breaking change for a client SDKs - [ ] breaking change for FNs (FN binary must upgrade) - [ ] breaking change for validators or node operators (must upgrade binaries) - [ ] breaking change for on-chain data layout - [ ] necessitate either a data wipe or data migration ### Release notes Move comments and byte-strings now support unicode characters. Unicode escape sequences are not yet implemented.
tnowacki · Nov 3, 2023 · 63568a7 · 63568a7
1 parent a78c82c
commit 63568a7
Show file tree

Hide file tree

Showing 16 changed files with 136 additions and 51 deletions.
diff --git a/external-crates/move/crates/move-command-line-common/src/character_sets.rs b/external-crates/move/crates/move-command-line-common/src/character_sets.rs
@@ -2,6 +2,8 @@
 // Copyright (c) The Move Contributors
 // SPDX-License-Identifier: Apache-2.0
 
+use std::fmt::{self, Display};
+
 /// Determine if a character is an allowed eye-visible (printable) character.
 ///
 /// The only allowed printable characters are the printable ascii characters (SPACE through ~) and
@@ -33,21 +35,34 @@ pub fn is_permitted_newline_crlf_chars(c1: char, c2: char) -> bool {
     is_cr && is_lf
 }
 
+// Taken from https://en.wikipedia.org/wiki/Bidirectional_text
+// TODO Double check 200F 200E 061C
+const UNICODE_TEXT_DIRECTIONALITY_CONTROL_CHARS: &[char] = &[
+    '\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
+    '\u{2069}',
+];
+
+/// Is some other non-whitespace unicode character
+pub fn is_permitted_non_ascii_unicode(c: char) -> bool {
+    !c.is_ascii() && !c.is_whitespace() && !UNICODE_TEXT_DIRECTIONALITY_CONTROL_CHARS.contains(&c)
+}
+
 /// Determine if a character is permitted character.
 ///
 /// A permitted character is either a permitted printable character, or a permitted
 /// newline. Any other characters are disallowed from appearing in the file.
 pub fn is_permitted_char(c: char) -> bool {
-    is_permitted_printable_char(c) || is_permitted_newline_lf_char(c)
+    is_permitted_printable_char(c)
+        || is_permitted_newline_lf_char(c)
+        || is_permitted_non_ascii_unicode(c)
 }
 
 /// Determine if the characters is permitted characters.
 ///
 /// A permitted characters is either a permitted printable character, or a permitted
 /// newlines. Any other characters are disallowed from appearing in the file.
-pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool {
-    let c1 = chars[idx] as char;
-
+pub fn is_permitted_chars(chars: &[char], idx: usize) -> bool {
+    let c1 = chars[idx];
     if is_permitted_char(c1) {
         return true;
     }
@@ -56,18 +71,31 @@ pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool {
         return false;
     }
 
-    let c2 = chars[idx + 1] as char;
+    let c2 = chars[idx + 1];
     is_permitted_newline_crlf_chars(c1, c2)
 }
 
+/// simple wrapper for displaying characters. Escapes non-printable characters for error messages
+pub struct DisplayChar(pub char);
+
+impl Display for DisplayChar {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.0 == ' ' || self.0.is_alphanumeric() || self.0.is_ascii_punctuation() {
+            self.0.fmt(f)
+        } else {
+            self.0.escape_default().fmt(f)
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     #[test]
     fn test_permitted_characters() {
-        let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
-        good_chars.push(0x0D); // \r
-        good_chars.push(0x0A); // \n
-        good_chars.push(0x09); // \t
+        let mut good_chars = (0x20..=0x7Eu8).map(|u| u as char).collect::<Vec<char>>();
+        good_chars.push('\r');
+        good_chars.push('\n');
+        good_chars.push('\t');
 
         for idx in 0..good_chars.len() {
             assert!(super::is_permitted_chars(&good_chars, idx));
@@ -76,8 +104,8 @@ mod tests {
 
     #[test]
     fn test_forbidden_last_lf_characters() {
-        let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
-        good_chars.push(0x0D); // \r
+        let mut good_chars = (0x20..=0x7Eu8).map(|u| u as char).collect::<Vec<char>>();
+        good_chars.push('\r');
 
         assert!(!super::is_permitted_chars(
             &good_chars,
@@ -87,12 +115,16 @@ mod tests {
 
     #[test]
     fn test_forbidden_characters() {
-        let mut bad_chars = (0x0..0x09).collect::<Vec<u8>>();
-        bad_chars.append(&mut (0x0B..=0x1F).collect::<Vec<u8>>());
-        bad_chars.push(0x7F);
+        let mut bad_chars = (0x0..0x09u8).map(|u| u as char).collect::<Vec<char>>();
+        bad_chars.extend((0x0B..=0x1Fu8).map(|u| u as char));
+        bad_chars.push(0x7Fu8 as char);
 
         for idx in 0..bad_chars.len() {
-            assert!(!super::is_permitted_chars(&bad_chars, idx));
+            assert!(
+                !super::is_permitted_chars(&bad_chars, idx),
+                "{:X?} is permitted",
+                bad_chars[idx] as u8
+            );
         }
     }
 }
diff --git a/external-crates/move/crates/move-compiler-transactional-tests/tests/parser/byte_string.move b/external-crates/move/crates/move-compiler-transactional-tests/tests/parser/byte_string.move
@@ -5,5 +5,10 @@ fun main() {
     assert!(b"" == x"", 0);
     assert!(b"Diem" == x"4469656D", 1);
     assert!(b"\x4c\x69\x62\x72\x61" == x"4c69627261", 2);
+    assert!(
+        b"Γ ⊢ λ x. x : ∀α. α → α" ==
+        x"CE9320E28AA220CEBB20782E2078203A20E28880CEB12E20CEB120E2869220CEB1",
+        3
+    );
 }
 }
diff --git a/external-crates/move/crates/move-compiler/src/expansion/byte_string.rs b/external-crates/move/crates/move-compiler/src/expansion/byte_string.rs
@@ -134,6 +134,7 @@ fn decode_(context: &mut Context, buffer: &mut Vec<u8>, chars: Vec<char>) {
 }
 
 fn push(buffer: &mut Vec<u8>, ch: char) {
-    assert!(ch.is_ascii(), "ICE ascii-only support is gated at parsing");
-    buffer.extend(vec![ch as u8]);
+    let mut bytes = vec![0; ch.len_utf8()];
+    ch.encode_utf8(&mut bytes);
+    buffer.extend(bytes);
 }
diff --git a/external-crates/move/crates/move-compiler/src/parser/comments.rs b/external-crates/move/crates/move-compiler/src/parser/comments.rs
@@ -3,7 +3,10 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use crate::{diag, diagnostics::Diagnostics};
-use move_command_line_common::{character_sets::is_permitted_chars, files::FileHash};
+use move_command_line_common::{
+    character_sets::{is_permitted_chars, DisplayChar},
+    files::FileHash,
+};
 use move_ir_types::location::*;
 use std::collections::BTreeMap;
 
@@ -15,18 +18,21 @@ pub type FileCommentMap = BTreeMap<(u32, u32), String>;
 // We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline
 // character--\r--,--\n--or a tab--\t.
 pub fn verify_string(file_hash: FileHash, string: &str) -> Result<(), Diagnostics> {
-    match string
-        .chars()
+    let chars: Vec<char> = string.chars().collect();
+    match chars
+        .iter()
         .enumerate()
-        .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
+        .find(|(idx, _)| !is_permitted_chars(&chars, *idx))
     {
         None => Ok(()),
-        Some((idx, chr)) => {
+        Some((idx, c)) => {
             let loc = Loc::new(file_hash, idx as u32, idx as u32);
             let msg = format!(
-                "Invalid character '{}' found when reading file. Only ASCII printable characters, \
-                 tabs (\\t), lf (\\n) and crlf (\\r\\n) are permitted.",
-                chr
+                "Invalid character '{}' found when reading file. \
+                For ASCII, only printable characters (tabs '\\t', lf '\\n' and crlf '\\r'+'\\n') \
+                are permitted. Unicode can be used in comments and string literals, \
+                excluding certain control characters.",
+                DisplayChar(*c),
             );
             Err(Diagnostics::from(vec![diag!(
                 Syntax::InvalidCharacter,

diff --git a/external-crates/move/crates/move-compiler/src/parser/lexer.rs b/external-crates/move/crates/move-compiler/src/parser/lexer.rs
@@ -6,7 +6,7 @@ use crate::{
     diag, diagnostics::Diagnostic, editions::SyntaxEdition, parser::syntax::make_loc,
     shared::CompilationEnv, FileCommentMap, MatchedFileCommentMap,
 };
-use move_command_line_common::files::FileHash;
+use move_command_line_common::{character_sets::DisplayChar, files::FileHash};
 use move_ir_types::location::Loc;
 use std::fmt;
 
@@ -303,7 +303,8 @@ impl<'input> Lexer<'input> {
                     } else {
                         // This is a solitary '/' or '*' that isn't part of any comment delimiter.
                         // Skip over it.
-                        text = &text[1..];
+                        let c = text.chars().next().unwrap();
+                        text = &text[c.len_utf8()..];
                     }
                 }
 
@@ -578,11 +579,11 @@ fn find_token(
         '}' => (Tok::RBrace, 1),
         '#' => (Tok::NumSign, 1),
         '@' => (Tok::AtSign, 1),
-        _ => {
+        c => {
             let loc = make_loc(file_hash, start_offset, start_offset);
             return Err(Box::new(diag!(
                 Syntax::InvalidCharacter,
-                (loc, format!("Invalid character: '{}'", c))
+                (loc, format!("Unexpected character: '{}'", DisplayChar(c),))
             )));
         }
     };
@@ -645,7 +646,7 @@ fn get_string_len(text: &str) -> Option<usize> {
         } else if chr == '"' {
             return Some(pos);
         }
-        pos += 1;
+        pos += chr.len_utf8();
     }
     None
 }
@@ -751,5 +752,8 @@ mod tests {
 
         assert_eq!(trim_start_whitespace(" \r\n\r\nxxx\n"), "xxx\n");
         assert_eq!(trim_start_whitespace("\r\n \t\nxxx\r\n"), "xxx\r\n");
+        assert_eq!(trim_start_whitespace("\r\n\u{A0}\n"), "\u{A0}\n");
+        assert_eq!(trim_start_whitespace("\r\n\u{A0}\n"), "\u{A0}\n");
+        assert_eq!(trim_start_whitespace("\t  \u{0085}\n"), "\u{0085}\n")
     }
 }
diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character.exp b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character.exp
@@ -2,5 +2,5 @@ error[E01001]: invalid character
   ┌─ tests/move_check/parser/invalid_character.move:1:12
   │
 1 │ address 0x1~
-  │            ^ Invalid character: '~'
+  │            ^ Unexpected character: '~'
 
diff --git a/...al-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.exp b/...al-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.exp
diff --git a/...l-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.move b/...l-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.move
diff --git a/...rnal-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.exp b/...rnal-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.exp
@@ -0,0 +1,6 @@
+error[E01001]: invalid character
+  ┌─ tests/move_check/parser/invalid_character_emoji.move:5:4
+  │
+5 │    ❤️
+  │    ^ Unexpected character: '\u{2764}'
+
diff --git a/...nal-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.move b/...nal-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.move
@@ -0,0 +1,8 @@
+address 0x1 {
+
+// Check that we provide good error messages for non-ASCII characters.
+module Temp {
+   ❤️
+}
+
+}
diff --git a/...-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp b/...-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp
@@ -2,5 +2,5 @@ error[E01001]: invalid character
   ┌─ tests/move_check/parser/invalid_character_non_ascii.move:5:4
   │
 5 │    ф
-  │    ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted.
+  │    ^ Unexpected character: 'ф'
 
diff --git a/...crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_comment.move b/...crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_comment.move
@@ -0,0 +1,19 @@
+address 0x1 {
+
+// Non-ASCII characters in comments (e.g., ф) are also allowed.
+module temp {
+    /// ❤️ Also in doc comments 💝
+    public fun foo() {}
+    /* block
+    Comment
+    Γ ⊢ λ x. x : ∀α. α → α
+    */
+    public fun bar() {}
+}
+
+}
+
+module a::n {
+    public fun foo() { 0x1::temp::foo() }
+    public fun bar() { 0x1::temp::bar() }
+}
diff --git a/...-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_string.move b/...-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_string.move
@@ -0,0 +1,5 @@
+module a::m {
+    public fun foo(): vector<u8> {
+        b"Γ ⊢ λ x. x : ∀α. α → α"
+    }
+}
diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.exp b/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.exp
@@ -0,0 +1,6 @@
+error[E01001]: invalid character
+  ┌─ tests/move_check/parser/trojan_source.move:3:24
+  │
+3 │         agent == b"user‮ ⁦// Check if user⁩ ⁦"
+  │                         Invalid character '\u{202e}' found when reading file. For ASCII, only printable characters (tabs '\t', lf '\n' and crlf '\r'+'\n') are permitted. Unicode can be used in comments and string literals, excluding certain control characters.
+
diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.move b/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.move
@@ -0,0 +1,5 @@
+module a::m {
+    public fun is_user(agent: vector<u8>): bool {
+        agent == b"user‮ ⁦// Check if user⁩ ⁦"
+    }
+}
diff --git a/external-crates/move/crates/move-ir-to-bytecode/src/parser.rs b/external-crates/move/crates/move-ir-to-bytecode/src/parser.rs
@@ -22,15 +22,16 @@ use move_ir_types::{ast, location::*};
 // handling of characters inside of comments (usually not included as tokens) and within byte
 // array literals.
 fn verify_string(string: &str) -> Result<()> {
-    string
-        .chars()
+    let chars: Vec<char> = string.chars().collect();
+    chars
+        .iter()
         .enumerate()
-        .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
+        .find(|(idx, _)| !is_permitted_chars(&chars, *idx))
         .map_or(Ok(()), |(_, c)| {
             bail!(
-                "Parser Error: invalid character {} found when reading file.\
-                 Only ascii printable, tabs (\\t), lf (\\n) and crlf (\\r\\n) characters are permitted.",
-                c
+                "Invalid character '{c}' found when reading file. \
+                For ASCII, only printable characters (tabs '\\t', lf '\\n' and crlf '\\r'+'\\n') \
+                are permitted. Unicode can be used, excluding certain control characters.",
             )
         })
 }