Skip to content

Commit

Permalink
[move-compiler] More unicode support (MystenLabs#14582)
Browse files Browse the repository at this point in the history
## Description 

- Relaxed ASCII restriction for Move files, banning only certain control
structures

## Test Plan 

- New tests 

---
If your changes are not user-facing and not a breaking change, you can
skip the following section. Otherwise, please indicate what changed, and
then add to the Release Notes section as highlighted during the release
process.

### Type of Change (Check all that apply)

- [ ] protocol change
- [X] user-visible impact
- [ ] breaking change for a client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitate either a data wipe or data migration

### Release notes

Move comments and byte-strings now support unicode characters. Unicode
escape sequences are not yet implemented.
  • Loading branch information
tnowacki authored Nov 3, 2023
1 parent a78c82c commit 63568a7
Show file tree
Hide file tree
Showing 16 changed files with 136 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// Copyright (c) The Move Contributors
// SPDX-License-Identifier: Apache-2.0

use std::fmt::{self, Display};

/// Determine if a character is an allowed eye-visible (printable) character.
///
/// The only allowed printable characters are the printable ascii characters (SPACE through ~) and
Expand Down Expand Up @@ -33,21 +35,34 @@ pub fn is_permitted_newline_crlf_chars(c1: char, c2: char) -> bool {
is_cr && is_lf
}

// Taken from https://en.wikipedia.org/wiki/Bidirectional_text
// TODO Double check 200F 200E 061C
const UNICODE_TEXT_DIRECTIONALITY_CONTROL_CHARS: &[char] = &[
'\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
'\u{2069}',
];

/// Is some other non-whitespace unicode character
pub fn is_permitted_non_ascii_unicode(c: char) -> bool {
!c.is_ascii() && !c.is_whitespace() && !UNICODE_TEXT_DIRECTIONALITY_CONTROL_CHARS.contains(&c)
}

/// Determine if a character is permitted character.
///
/// A permitted character is either a permitted printable character, or a permitted
/// newline. Any other characters are disallowed from appearing in the file.
pub fn is_permitted_char(c: char) -> bool {
is_permitted_printable_char(c) || is_permitted_newline_lf_char(c)
is_permitted_printable_char(c)
|| is_permitted_newline_lf_char(c)
|| is_permitted_non_ascii_unicode(c)
}

/// Determine if the characters is permitted characters.
///
/// A permitted characters is either a permitted printable character, or a permitted
/// newlines. Any other characters are disallowed from appearing in the file.
pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool {
let c1 = chars[idx] as char;

pub fn is_permitted_chars(chars: &[char], idx: usize) -> bool {
let c1 = chars[idx];
if is_permitted_char(c1) {
return true;
}
Expand All @@ -56,18 +71,31 @@ pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool {
return false;
}

let c2 = chars[idx + 1] as char;
let c2 = chars[idx + 1];
is_permitted_newline_crlf_chars(c1, c2)
}

/// simple wrapper for displaying characters. Escapes non-printable characters for error messages
pub struct DisplayChar(pub char);

impl Display for DisplayChar {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.0 == ' ' || self.0.is_alphanumeric() || self.0.is_ascii_punctuation() {
self.0.fmt(f)
} else {
self.0.escape_default().fmt(f)
}
}
}

#[cfg(test)]
mod tests {
#[test]
fn test_permitted_characters() {
let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
good_chars.push(0x0D); // \r
good_chars.push(0x0A); // \n
good_chars.push(0x09); // \t
let mut good_chars = (0x20..=0x7Eu8).map(|u| u as char).collect::<Vec<char>>();
good_chars.push('\r');
good_chars.push('\n');
good_chars.push('\t');

for idx in 0..good_chars.len() {
assert!(super::is_permitted_chars(&good_chars, idx));
Expand All @@ -76,8 +104,8 @@ mod tests {

#[test]
fn test_forbidden_last_lf_characters() {
let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
good_chars.push(0x0D); // \r
let mut good_chars = (0x20..=0x7Eu8).map(|u| u as char).collect::<Vec<char>>();
good_chars.push('\r');

assert!(!super::is_permitted_chars(
&good_chars,
Expand All @@ -87,12 +115,16 @@ mod tests {

#[test]
fn test_forbidden_characters() {
let mut bad_chars = (0x0..0x09).collect::<Vec<u8>>();
bad_chars.append(&mut (0x0B..=0x1F).collect::<Vec<u8>>());
bad_chars.push(0x7F);
let mut bad_chars = (0x0..0x09u8).map(|u| u as char).collect::<Vec<char>>();
bad_chars.extend((0x0B..=0x1Fu8).map(|u| u as char));
bad_chars.push(0x7Fu8 as char);

for idx in 0..bad_chars.len() {
assert!(!super::is_permitted_chars(&bad_chars, idx));
assert!(
!super::is_permitted_chars(&bad_chars, idx),
"{:X?} is permitted",
bad_chars[idx] as u8
);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,10 @@ fun main() {
assert!(b"" == x"", 0);
assert!(b"Diem" == x"4469656D", 1);
assert!(b"\x4c\x69\x62\x72\x61" == x"4c69627261", 2);
assert!(
b"Γ λ x. x : ∀α. α α" ==
x"CE9320E28AA220CEBB20782E2078203A20E28880CEB12E20CEB120E2869220CEB1",
3
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ fn decode_(context: &mut Context, buffer: &mut Vec<u8>, chars: Vec<char>) {
}

fn push(buffer: &mut Vec<u8>, ch: char) {
assert!(ch.is_ascii(), "ICE ascii-only support is gated at parsing");
buffer.extend(vec![ch as u8]);
let mut bytes = vec![0; ch.len_utf8()];
ch.encode_utf8(&mut bytes);
buffer.extend(bytes);
}
22 changes: 14 additions & 8 deletions external-crates/move/crates/move-compiler/src/parser/comments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
// SPDX-License-Identifier: Apache-2.0

use crate::{diag, diagnostics::Diagnostics};
use move_command_line_common::{character_sets::is_permitted_chars, files::FileHash};
use move_command_line_common::{
character_sets::{is_permitted_chars, DisplayChar},
files::FileHash,
};
use move_ir_types::location::*;
use std::collections::BTreeMap;

Expand All @@ -15,18 +18,21 @@ pub type FileCommentMap = BTreeMap<(u32, u32), String>;
// We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline
// character--\r--,--\n--or a tab--\t.
pub fn verify_string(file_hash: FileHash, string: &str) -> Result<(), Diagnostics> {
match string
.chars()
let chars: Vec<char> = string.chars().collect();
match chars
.iter()
.enumerate()
.find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
.find(|(idx, _)| !is_permitted_chars(&chars, *idx))
{
None => Ok(()),
Some((idx, chr)) => {
Some((idx, c)) => {
let loc = Loc::new(file_hash, idx as u32, idx as u32);
let msg = format!(
"Invalid character '{}' found when reading file. Only ASCII printable characters, \
tabs (\\t), lf (\\n) and crlf (\\r\\n) are permitted.",
chr
"Invalid character '{}' found when reading file. \
For ASCII, only printable characters (tabs '\\t', lf '\\n' and crlf '\\r'+'\\n') \
are permitted. Unicode can be used in comments and string literals, \
excluding certain control characters.",
DisplayChar(*c),
);
Err(Diagnostics::from(vec![diag!(
Syntax::InvalidCharacter,
Expand Down
14 changes: 9 additions & 5 deletions external-crates/move/crates/move-compiler/src/parser/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::{
diag, diagnostics::Diagnostic, editions::SyntaxEdition, parser::syntax::make_loc,
shared::CompilationEnv, FileCommentMap, MatchedFileCommentMap,
};
use move_command_line_common::files::FileHash;
use move_command_line_common::{character_sets::DisplayChar, files::FileHash};
use move_ir_types::location::Loc;
use std::fmt;

Expand Down Expand Up @@ -303,7 +303,8 @@ impl<'input> Lexer<'input> {
} else {
// This is a solitary '/' or '*' that isn't part of any comment delimiter.
// Skip over it.
text = &text[1..];
let c = text.chars().next().unwrap();
text = &text[c.len_utf8()..];
}
}

Expand Down Expand Up @@ -578,11 +579,11 @@ fn find_token(
'}' => (Tok::RBrace, 1),
'#' => (Tok::NumSign, 1),
'@' => (Tok::AtSign, 1),
_ => {
c => {
let loc = make_loc(file_hash, start_offset, start_offset);
return Err(Box::new(diag!(
Syntax::InvalidCharacter,
(loc, format!("Invalid character: '{}'", c))
(loc, format!("Unexpected character: '{}'", DisplayChar(c),))
)));
}
};
Expand Down Expand Up @@ -645,7 +646,7 @@ fn get_string_len(text: &str) -> Option<usize> {
} else if chr == '"' {
return Some(pos);
}
pos += 1;
pos += chr.len_utf8();
}
None
}
Expand Down Expand Up @@ -751,5 +752,8 @@ mod tests {

assert_eq!(trim_start_whitespace(" \r\n\r\nxxx\n"), "xxx\n");
assert_eq!(trim_start_whitespace("\r\n \t\nxxx\r\n"), "xxx\r\n");
assert_eq!(trim_start_whitespace("\r\n\u{A0}\n"), "\u{A0}\n");
assert_eq!(trim_start_whitespace("\r\n\u{A0}\n"), "\u{A0}\n");
assert_eq!(trim_start_whitespace("\t \u{0085}\n"), "\u{0085}\n")
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ error[E01001]: invalid character
┌─ tests/move_check/parser/invalid_character.move:1:12
1 │ address 0x1~
│ ^ Invalid character: '~'
│ ^ Unexpected character: '~'

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
error[E01001]: invalid character
┌─ tests/move_check/parser/invalid_character_emoji.move:5:4
5 │ ❤️
│ ^ Unexpected character: '\u{2764}'

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
address 0x1 {

// Check that we provide good error messages for non-ASCII characters.
module Temp {
❤️
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ error[E01001]: invalid character
┌─ tests/move_check/parser/invalid_character_non_ascii.move:5:4
5 │ ф
│ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted.
│ ^ Unexpected character: 'ф'

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
address 0x1 {

// Non-ASCII characters in comments (e.g., ф) are also allowed.
module temp {
/// ❤️ Also in doc comments 💝
public fun foo() {}
/* block
Comment
Γ ⊢ λ x. x : ∀α. α → α
*/
public fun bar() {}
}

}

module a::n {
public fun foo() { 0x1::temp::foo() }
public fun bar() { 0x1::temp::bar() }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module a::m {
public fun foo(): vector<u8> {
b"Γ λ x. x : ∀α. α α"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
error[E01001]: invalid character
┌─ tests/move_check/parser/trojan_source.move:3:24
3 │ agent == b"user‮ ⁦// Check if user⁩ ⁦"
│ Invalid character '\u{202e}' found when reading file. For ASCII, only printable characters (tabs '\t', lf '\n' and crlf '\r'+'\n') are permitted. Unicode can be used in comments and string literals, excluding certain control characters.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module a::m {
public fun is_user(agent: vector<u8>): bool {
agent == b"user // Check if user ⁦"
}
}
13 changes: 7 additions & 6 deletions external-crates/move/crates/move-ir-to-bytecode/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,16 @@ use move_ir_types::{ast, location::*};
// handling of characters inside of comments (usually not included as tokens) and within byte
// array literals.
fn verify_string(string: &str) -> Result<()> {
string
.chars()
let chars: Vec<char> = string.chars().collect();
chars
.iter()
.enumerate()
.find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
.find(|(idx, _)| !is_permitted_chars(&chars, *idx))
.map_or(Ok(()), |(_, c)| {
bail!(
"Parser Error: invalid character {} found when reading file.\
Only ascii printable, tabs (\\t), lf (\\n) and crlf (\\r\\n) characters are permitted.",
c
"Invalid character '{c}' found when reading file. \
For ASCII, only printable characters (tabs '\\t', lf '\\n' and crlf '\\r'+'\\n') \
are permitted. Unicode can be used, excluding certain control characters.",
)
})
}
Expand Down

0 comments on commit 63568a7

Please sign in to comment.