From 63568a76450f1c71a181ef936a1bba98f509ddd4 Mon Sep 17 00:00:00 2001 From: Todd Nowacki Date: Fri, 3 Nov 2023 12:28:40 -0700 Subject: [PATCH] [move-compiler] More unicode support (#14582) ## Description - Relaxed ASCII restriction for Move files, banning only certain control structures ## Test Plan - New tests --- If your changes are not user-facing and not a breaking change, you can skip the following section. Otherwise, please indicate what changed, and then add to the Release Notes section as highlighted during the release process. ### Type of Change (Check all that apply) - [ ] protocol change - [X] user-visible impact - [ ] breaking change for a client SDKs - [ ] breaking change for FNs (FN binary must upgrade) - [ ] breaking change for validators or node operators (must upgrade binaries) - [ ] breaking change for on-chain data layout - [ ] necessitate either a data wipe or data migration ### Release notes Move comments and byte-strings now support unicode characters. Unicode escape sequences are not yet implemented. --- .../src/character_sets.rs | 62 ++++++++++++++----- .../tests/parser/byte_string.move | 5 ++ .../src/expansion/byte_string.rs | 5 +- .../move-compiler/src/parser/comments.rs | 22 ++++--- .../crates/move-compiler/src/parser/lexer.rs | 14 +++-- .../move_check/parser/invalid_character.exp | 2 +- .../parser/invalid_character_comment.exp | 6 -- .../parser/invalid_character_comment.move | 7 --- .../parser/invalid_character_emoji.exp | 6 ++ .../parser/invalid_character_emoji.move | 8 +++ .../parser/invalid_character_non_ascii.exp | 2 +- .../parser/non_ascii_character_comment.move | 19 ++++++ .../parser/non_ascii_character_string.move | 5 ++ .../tests/move_check/parser/trojan_source.exp | 6 ++ .../move_check/parser/trojan_source.move | 5 ++ .../crates/move-ir-to-bytecode/src/parser.rs | 13 ++-- 16 files changed, 136 insertions(+), 51 deletions(-) delete mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.exp delete mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.move create mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.exp create mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.move create mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_comment.move create mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_string.move create mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.exp create mode 100644 external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.move diff --git a/external-crates/move/crates/move-command-line-common/src/character_sets.rs b/external-crates/move/crates/move-command-line-common/src/character_sets.rs index 47223084015c4..4700c535e983d 100644 --- a/external-crates/move/crates/move-command-line-common/src/character_sets.rs +++ b/external-crates/move/crates/move-command-line-common/src/character_sets.rs @@ -2,6 +2,8 @@ // Copyright (c) The Move Contributors // SPDX-License-Identifier: Apache-2.0 +use std::fmt::{self, Display}; + /// Determine if a character is an allowed eye-visible (printable) character. /// /// The only allowed printable characters are the printable ascii characters (SPACE through ~) and @@ -33,21 +35,34 @@ pub fn is_permitted_newline_crlf_chars(c1: char, c2: char) -> bool { is_cr && is_lf } +// Taken from https://en.wikipedia.org/wiki/Bidirectional_text +// TODO Double check 200F 200E 061C +const UNICODE_TEXT_DIRECTIONALITY_CONTROL_CHARS: &[char] = &[ + '\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', + '\u{2069}', +]; + +/// Is some other non-whitespace unicode character +pub fn is_permitted_non_ascii_unicode(c: char) -> bool { + !c.is_ascii() && !c.is_whitespace() && !UNICODE_TEXT_DIRECTIONALITY_CONTROL_CHARS.contains(&c) +} + /// Determine if a character is permitted character. /// /// A permitted character is either a permitted printable character, or a permitted /// newline. Any other characters are disallowed from appearing in the file. pub fn is_permitted_char(c: char) -> bool { - is_permitted_printable_char(c) || is_permitted_newline_lf_char(c) + is_permitted_printable_char(c) + || is_permitted_newline_lf_char(c) + || is_permitted_non_ascii_unicode(c) } /// Determine if the characters is permitted characters. /// /// A permitted characters is either a permitted printable character, or a permitted /// newlines. Any other characters are disallowed from appearing in the file. -pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool { - let c1 = chars[idx] as char; - +pub fn is_permitted_chars(chars: &[char], idx: usize) -> bool { + let c1 = chars[idx]; if is_permitted_char(c1) { return true; } @@ -56,18 +71,31 @@ pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool { return false; } - let c2 = chars[idx + 1] as char; + let c2 = chars[idx + 1]; is_permitted_newline_crlf_chars(c1, c2) } +/// simple wrapper for displaying characters. Escapes non-printable characters for error messages +pub struct DisplayChar(pub char); + +impl Display for DisplayChar { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.0 == ' ' || self.0.is_alphanumeric() || self.0.is_ascii_punctuation() { + self.0.fmt(f) + } else { + self.0.escape_default().fmt(f) + } + } +} + #[cfg(test)] mod tests { #[test] fn test_permitted_characters() { - let mut good_chars = (0x20..=0x7E).collect::>(); - good_chars.push(0x0D); // \r - good_chars.push(0x0A); // \n - good_chars.push(0x09); // \t + let mut good_chars = (0x20..=0x7Eu8).map(|u| u as char).collect::>(); + good_chars.push('\r'); + good_chars.push('\n'); + good_chars.push('\t'); for idx in 0..good_chars.len() { assert!(super::is_permitted_chars(&good_chars, idx)); @@ -76,8 +104,8 @@ mod tests { #[test] fn test_forbidden_last_lf_characters() { - let mut good_chars = (0x20..=0x7E).collect::>(); - good_chars.push(0x0D); // \r + let mut good_chars = (0x20..=0x7Eu8).map(|u| u as char).collect::>(); + good_chars.push('\r'); assert!(!super::is_permitted_chars( &good_chars, @@ -87,12 +115,16 @@ mod tests { #[test] fn test_forbidden_characters() { - let mut bad_chars = (0x0..0x09).collect::>(); - bad_chars.append(&mut (0x0B..=0x1F).collect::>()); - bad_chars.push(0x7F); + let mut bad_chars = (0x0..0x09u8).map(|u| u as char).collect::>(); + bad_chars.extend((0x0B..=0x1Fu8).map(|u| u as char)); + bad_chars.push(0x7Fu8 as char); for idx in 0..bad_chars.len() { - assert!(!super::is_permitted_chars(&bad_chars, idx)); + assert!( + !super::is_permitted_chars(&bad_chars, idx), + "{:X?} is permitted", + bad_chars[idx] as u8 + ); } } } diff --git a/external-crates/move/crates/move-compiler-transactional-tests/tests/parser/byte_string.move b/external-crates/move/crates/move-compiler-transactional-tests/tests/parser/byte_string.move index 06d72a18b3c44..7b1e9aacdc6f2 100644 --- a/external-crates/move/crates/move-compiler-transactional-tests/tests/parser/byte_string.move +++ b/external-crates/move/crates/move-compiler-transactional-tests/tests/parser/byte_string.move @@ -5,5 +5,10 @@ fun main() { assert!(b"" == x"", 0); assert!(b"Diem" == x"4469656D", 1); assert!(b"\x4c\x69\x62\x72\x61" == x"4c69627261", 2); + assert!( + b"Γ ⊢ λ x. x : ∀α. α → α" == + x"CE9320E28AA220CEBB20782E2078203A20E28880CEB12E20CEB120E2869220CEB1", + 3 + ); } } diff --git a/external-crates/move/crates/move-compiler/src/expansion/byte_string.rs b/external-crates/move/crates/move-compiler/src/expansion/byte_string.rs index 56379811a3070..635b437222c27 100644 --- a/external-crates/move/crates/move-compiler/src/expansion/byte_string.rs +++ b/external-crates/move/crates/move-compiler/src/expansion/byte_string.rs @@ -134,6 +134,7 @@ fn decode_(context: &mut Context, buffer: &mut Vec, chars: Vec) { } fn push(buffer: &mut Vec, ch: char) { - assert!(ch.is_ascii(), "ICE ascii-only support is gated at parsing"); - buffer.extend(vec![ch as u8]); + let mut bytes = vec![0; ch.len_utf8()]; + ch.encode_utf8(&mut bytes); + buffer.extend(bytes); } diff --git a/external-crates/move/crates/move-compiler/src/parser/comments.rs b/external-crates/move/crates/move-compiler/src/parser/comments.rs index aa751ce31b801..fc675403f7fa5 100644 --- a/external-crates/move/crates/move-compiler/src/parser/comments.rs +++ b/external-crates/move/crates/move-compiler/src/parser/comments.rs @@ -3,7 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 use crate::{diag, diagnostics::Diagnostics}; -use move_command_line_common::{character_sets::is_permitted_chars, files::FileHash}; +use move_command_line_common::{ + character_sets::{is_permitted_chars, DisplayChar}, + files::FileHash, +}; use move_ir_types::location::*; use std::collections::BTreeMap; @@ -15,18 +18,21 @@ pub type FileCommentMap = BTreeMap<(u32, u32), String>; // We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline // character--\r--,--\n--or a tab--\t. pub fn verify_string(file_hash: FileHash, string: &str) -> Result<(), Diagnostics> { - match string - .chars() + let chars: Vec = string.chars().collect(); + match chars + .iter() .enumerate() - .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx)) + .find(|(idx, _)| !is_permitted_chars(&chars, *idx)) { None => Ok(()), - Some((idx, chr)) => { + Some((idx, c)) => { let loc = Loc::new(file_hash, idx as u32, idx as u32); let msg = format!( - "Invalid character '{}' found when reading file. Only ASCII printable characters, \ - tabs (\\t), lf (\\n) and crlf (\\r\\n) are permitted.", - chr + "Invalid character '{}' found when reading file. \ + For ASCII, only printable characters (tabs '\\t', lf '\\n' and crlf '\\r'+'\\n') \ + are permitted. Unicode can be used in comments and string literals, \ + excluding certain control characters.", + DisplayChar(*c), ); Err(Diagnostics::from(vec![diag!( Syntax::InvalidCharacter, diff --git a/external-crates/move/crates/move-compiler/src/parser/lexer.rs b/external-crates/move/crates/move-compiler/src/parser/lexer.rs index f7d5c327b1a4a..8a1964ce0ef5d 100644 --- a/external-crates/move/crates/move-compiler/src/parser/lexer.rs +++ b/external-crates/move/crates/move-compiler/src/parser/lexer.rs @@ -6,7 +6,7 @@ use crate::{ diag, diagnostics::Diagnostic, editions::SyntaxEdition, parser::syntax::make_loc, shared::CompilationEnv, FileCommentMap, MatchedFileCommentMap, }; -use move_command_line_common::files::FileHash; +use move_command_line_common::{character_sets::DisplayChar, files::FileHash}; use move_ir_types::location::Loc; use std::fmt; @@ -303,7 +303,8 @@ impl<'input> Lexer<'input> { } else { // This is a solitary '/' or '*' that isn't part of any comment delimiter. // Skip over it. - text = &text[1..]; + let c = text.chars().next().unwrap(); + text = &text[c.len_utf8()..]; } } @@ -578,11 +579,11 @@ fn find_token( '}' => (Tok::RBrace, 1), '#' => (Tok::NumSign, 1), '@' => (Tok::AtSign, 1), - _ => { + c => { let loc = make_loc(file_hash, start_offset, start_offset); return Err(Box::new(diag!( Syntax::InvalidCharacter, - (loc, format!("Invalid character: '{}'", c)) + (loc, format!("Unexpected character: '{}'", DisplayChar(c),)) ))); } }; @@ -645,7 +646,7 @@ fn get_string_len(text: &str) -> Option { } else if chr == '"' { return Some(pos); } - pos += 1; + pos += chr.len_utf8(); } None } @@ -751,5 +752,8 @@ mod tests { assert_eq!(trim_start_whitespace(" \r\n\r\nxxx\n"), "xxx\n"); assert_eq!(trim_start_whitespace("\r\n \t\nxxx\r\n"), "xxx\r\n"); + assert_eq!(trim_start_whitespace("\r\n\u{A0}\n"), "\u{A0}\n"); + assert_eq!(trim_start_whitespace("\r\n\u{A0}\n"), "\u{A0}\n"); + assert_eq!(trim_start_whitespace("\t \u{0085}\n"), "\u{0085}\n") } } diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character.exp b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character.exp index 77e8fdfbbdbd7..ccae2a21c03c1 100644 --- a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character.exp +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character.exp @@ -2,5 +2,5 @@ error[E01001]: invalid character ┌─ tests/move_check/parser/invalid_character.move:1:12 │ 1 │ address 0x1~ - │ ^ Invalid character: '~' + │ ^ Unexpected character: '~' diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.exp b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.exp deleted file mode 100644 index aa375653edbf1..0000000000000 --- a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.exp +++ /dev/null @@ -1,6 +0,0 @@ -error[E01001]: invalid character - ┌─ tests/move_check/parser/invalid_character_comment.move:3:44 - │ -3 │ // Non-ASCII characters in comments (e.g., ф) are also not allowed. - │ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted. - diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.move b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.move deleted file mode 100644 index 2491cf56b176f..0000000000000 --- a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_comment.move +++ /dev/null @@ -1,7 +0,0 @@ -address 0x1 { - -// Non-ASCII characters in comments (e.g., ф) are also not allowed. -module Temp { -} - -} diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.exp b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.exp new file mode 100644 index 0000000000000..b8aba9dd39d75 --- /dev/null +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.exp @@ -0,0 +1,6 @@ +error[E01001]: invalid character + ┌─ tests/move_check/parser/invalid_character_emoji.move:5:4 + │ +5 │ ❤️ + │ ^ Unexpected character: '\u{2764}' + diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.move b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.move new file mode 100644 index 0000000000000..d8f4ce77e0cdd --- /dev/null +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_emoji.move @@ -0,0 +1,8 @@ +address 0x1 { + +// Check that we provide good error messages for non-ASCII characters. +module Temp { + ❤️ +} + +} diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp index 3ffa9e0d47d2c..573f68255b7f3 100644 --- a/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp @@ -2,5 +2,5 @@ error[E01001]: invalid character ┌─ tests/move_check/parser/invalid_character_non_ascii.move:5:4 │ 5 │ ф - │ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted. + │ ^ Unexpected character: 'ф' diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_comment.move b/external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_comment.move new file mode 100644 index 0000000000000..3029d52ef533e --- /dev/null +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_comment.move @@ -0,0 +1,19 @@ +address 0x1 { + +// Non-ASCII characters in comments (e.g., ф) are also allowed. +module temp { + /// ❤️ Also in doc comments 💝 + public fun foo() {} + /* block + Comment + Γ ⊢ λ x. x : ∀α. α → α + */ + public fun bar() {} +} + +} + +module a::n { + public fun foo() { 0x1::temp::foo() } + public fun bar() { 0x1::temp::bar() } +} diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_string.move b/external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_string.move new file mode 100644 index 0000000000000..92aa9fd74b578 --- /dev/null +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/non_ascii_character_string.move @@ -0,0 +1,5 @@ +module a::m { + public fun foo(): vector { + b"Γ ⊢ λ x. x : ∀α. α → α" + } +} diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.exp b/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.exp new file mode 100644 index 0000000000000..b717706baaddb --- /dev/null +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.exp @@ -0,0 +1,6 @@ +error[E01001]: invalid character + ┌─ tests/move_check/parser/trojan_source.move:3:24 + │ +3 │ agent == b"user‮ ⁦// Check if user⁩ ⁦" + │ Invalid character '\u{202e}' found when reading file. For ASCII, only printable characters (tabs '\t', lf '\n' and crlf '\r'+'\n') are permitted. Unicode can be used in comments and string literals, excluding certain control characters. + diff --git a/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.move b/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.move new file mode 100644 index 0000000000000..bd14dfffc6cda --- /dev/null +++ b/external-crates/move/crates/move-compiler/tests/move_check/parser/trojan_source.move @@ -0,0 +1,5 @@ +module a::m { + public fun is_user(agent: vector): bool { + agent == b"user‮ ⁦// Check if user⁩ ⁦" + } +} diff --git a/external-crates/move/crates/move-ir-to-bytecode/src/parser.rs b/external-crates/move/crates/move-ir-to-bytecode/src/parser.rs index b8afc99901d4f..436094a266d1f 100644 --- a/external-crates/move/crates/move-ir-to-bytecode/src/parser.rs +++ b/external-crates/move/crates/move-ir-to-bytecode/src/parser.rs @@ -22,15 +22,16 @@ use move_ir_types::{ast, location::*}; // handling of characters inside of comments (usually not included as tokens) and within byte // array literals. fn verify_string(string: &str) -> Result<()> { - string - .chars() + let chars: Vec = string.chars().collect(); + chars + .iter() .enumerate() - .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx)) + .find(|(idx, _)| !is_permitted_chars(&chars, *idx)) .map_or(Ok(()), |(_, c)| { bail!( - "Parser Error: invalid character {} found when reading file.\ - Only ascii printable, tabs (\\t), lf (\\n) and crlf (\\r\\n) characters are permitted.", - c + "Invalid character '{c}' found when reading file. \ + For ASCII, only printable characters (tabs '\\t', lf '\\n' and crlf '\\r'+'\\n') \ + are permitted. Unicode can be used, excluding certain control characters.", ) }) }