From 473063cc26162af14c7823e5a37d4911281f582c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sat, 23 Nov 2024 10:24:17 -0500 Subject: [PATCH] LibUnicode+LibWeb: Rework `bidirectional_class()` API a bit Before this commit, GenerateUnicodeData generated a `BidirectionalClass` enum directly from UnicodeData.txt. Now, GenerateUnicodeData instead generates `BidirectionalClassInternal` and CharacterTypes.h has an explicit enum `BidiClass` with nicer names -- no underscores, but also names more similar to ICUs names. (Since CharacterTypes.h is included in some of LibUnicode's generators, it can't refer to generated `BidirectionalClassInternal`, so we instead have a big manual mapping switch in UnicodeData.cpp.) `bidirectional_class()` used to return an `Optional` for when unicode data was disabled. Now we return `BidiClass` and just return `BidiClass::LeftToRight` instead of making every client do this. It also updates LibWeb's Element.cpp to this new system. Also remove now-unused `bidirectional_class_from_string()`. This kind of cherry-picks the 4th commit of LadybirdBrowser/ladybird#239 (aa3a30870b58c47cb37bce1418d7e6bee7af71d9): The CharacterTypes.h, Element.cpp and TestUnicodeCharacterTypes.cpp changes are by trflynn. Co-authored-by: Tim Flynn --- .../LibUnicode/GenerateUnicodeData.cpp | 10 ++- .../LibUnicode/TestUnicodeCharacterTypes.cpp | 24 ++----- .../Libraries/LibUnicode/CharacterTypes.cpp | 62 ++++++++++++++++++- .../Libraries/LibUnicode/CharacterTypes.h | 32 +++++++++- Userland/Libraries/LibUnicode/Forward.h | 2 +- Userland/Libraries/LibWeb/DOM/Element.cpp | 20 ++---- 6 files changed, 107 insertions(+), 43 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 35e59a76765dbd..7dbed005d0b609 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -843,7 +843,7 @@ namespace Unicode { generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys()); generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys()); generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags); - generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values(), unicode_data.bidirectional_class_aliases); + generate_enum("BidirectionalClassInternal"sv, {}, unicode_data.bidirectional_classes.values(), unicode_data.bidirectional_class_aliases); generator.append(R"~~~( struct SpecialCasing { @@ -1038,7 +1038,7 @@ struct CodePointNameComparator : public CodePointRangeComparator { struct BidiClassData { CodePointRange code_point_range {}; - BidirectionalClass bidi_class {}; + BidirectionalClassInternal bidi_class {}; }; struct CodePointBidiClassComparator : public CodePointRangeComparator { @@ -1303,7 +1303,7 @@ static constexpr Array s_bidirectional_classes { { generator.set("first", ByteString::formatted("{:#x}", data.code_point_range.first)); generator.set("last", ByteString::formatted("{:#x}", data.code_point_range.last)); generator.set("bidi_class", data.bidi_class); - generator.append("{ { @first@, @last@ }, BidirectionalClass::@bidi_class@ }"); + generator.append("{ { @first@, @last@ }, BidirectionalClassInternal::@bidi_class@ }"); if (bidi_classes_in_current_row == max_bidi_classes_per_row) { bidi_classes_in_current_row = 0; @@ -1445,7 +1445,7 @@ Optional code_point_composition(u32 first_code_point, u32 second_code_point return {}; } -Optional bidirectional_class(u32 code_point) +Optional bidirectional_class_internal(u32 code_point) { if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {})) return entry->bidi_class; @@ -1513,8 +1513,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv)); TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv)); - TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {})); - generator.append(R"~~~( } )~~~"); diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index ff35dedc747b50..8a91485799d810 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -416,25 +416,13 @@ TEST_CASE(code_point_display_name) TEST_CASE(code_point_bidirectional_character_type) { - auto code_point_bidi_class = [](u32 code_point) { - auto bidi_class = Unicode::bidirectional_class(code_point); - VERIFY(bidi_class.has_value()); - return bidi_class.release_value(); - }; - - auto bidi_class_from_string = [](StringView name) { - auto result = Unicode::bidirectional_class_from_string(name); - VERIFY(result.has_value()); - return result.release_value(); - }; - // Left-to-right - EXPECT_EQ(code_point_bidi_class('A'), bidi_class_from_string("L"sv)); - EXPECT_EQ(code_point_bidi_class('z'), bidi_class_from_string("L"sv)); + EXPECT_EQ(Unicode::bidirectional_class('A'), Unicode::BidiClass::LeftToRight); + EXPECT_EQ(Unicode::bidirectional_class('z'), Unicode::BidiClass::LeftToRight); // European number - EXPECT_EQ(code_point_bidi_class('7'), bidi_class_from_string("EN"sv)); + EXPECT_EQ(Unicode::bidirectional_class('7'), Unicode::BidiClass::EuropeanNumber); // Whitespace - EXPECT_EQ(code_point_bidi_class(' '), bidi_class_from_string("WS"sv)); + EXPECT_EQ(Unicode::bidirectional_class(' '), Unicode::BidiClass::WhiteSpaceNeutral); // Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM) - EXPECT_EQ(code_point_bidi_class(0xFEB4), bidi_class_from_string("AL"sv)); + EXPECT_EQ(Unicode::bidirectional_class(0xFEB4), Unicode::BidiClass::RightToLeftArabic); } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index b6a5d81f0fabd5..c2fca6831b4a67 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -228,7 +228,65 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; } bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; } -Optional __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; } -Optional __attribute__((weak)) bidirectional_class(u32) { return {}; } +Optional __attribute__((weak)) bidirectional_class_internal(u32) { return {}; } + +static constexpr BidiClass bidi_class_from_bidirectional_class_interal(BidirectionalClassInternal direction) +{ + switch (direction) { + case BidirectionalClassInternal::Arabic_Number: + return BidiClass::ArabicNumber; + case BidirectionalClassInternal::Paragraph_Separator: + return BidiClass::BlockSeparator; + case BidirectionalClassInternal::Boundary_Neutral: + return BidiClass::BoundaryNeutral; + case BidirectionalClassInternal::Common_Separator: + return BidiClass::CommonNumberSeparator; + case BidirectionalClassInternal::Nonspacing_Mark: + return BidiClass::DirNonSpacingMark; + case BidirectionalClassInternal::European_Number: + return BidiClass::EuropeanNumber; + case BidirectionalClassInternal::European_Separator: + return BidiClass::EuropeanNumberSeparator; + case BidirectionalClassInternal::European_Terminator: + return BidiClass::EuropeanNumberTerminator; + case BidirectionalClassInternal::First_Strong_Isolate: + return BidiClass::FirstStrongIsolate; + case BidirectionalClassInternal::Left_To_Right: + return BidiClass::LeftToRight; + case BidirectionalClassInternal::Left_To_Right_Embedding: + return BidiClass::LeftToRightEmbedding; + case BidirectionalClassInternal::Left_To_Right_Isolate: + return BidiClass::LeftToRightIsolate; + case BidirectionalClassInternal::Left_To_Right_Override: + return BidiClass::LeftToRightOverride; + case BidirectionalClassInternal::Other_Neutral: + return BidiClass::OtherNeutral; + case BidirectionalClassInternal::Pop_Directional_Format: + return BidiClass::PopDirectionalFormat; + case BidirectionalClassInternal::Pop_Directional_Isolate: + return BidiClass::PopDirectionalIsolate; + case BidirectionalClassInternal::Right_To_Left: + return BidiClass::RightToLeft; + case BidirectionalClassInternal::Arabic_Letter: + return BidiClass::RightToLeftArabic; + case BidirectionalClassInternal::Right_To_Left_Embedding: + return BidiClass::RightToLeftEmbedding; + case BidirectionalClassInternal::Right_To_Left_Isolate: + return BidiClass::RightToLeftIsolate; + case BidirectionalClassInternal::Right_To_Left_Override: + return BidiClass::RightToLeftOverride; + case BidirectionalClassInternal::Segment_Separator: + return BidiClass::SegmentSeparator; + case BidirectionalClassInternal::White_Space: + return BidiClass::WhiteSpaceNeutral; + } + + VERIFY_NOT_REACHED(); +} + +BidiClass bidirectional_class(u32 code_point) +{ + return bidi_class_from_bidirectional_class_interal(bidirectional_class_internal(code_point).value_or(BidirectionalClassInternal::LeftToRight)); +} } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 53a85209efa3bb..060c25965e31f2 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -69,7 +69,35 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property); bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property); -Optional bidirectional_class_from_string(StringView); -Optional bidirectional_class(u32 code_point); +enum class BidirectionalClassInternal : u8; +Optional bidirectional_class_internal(u32 code_point); + +enum class BidiClass { + ArabicNumber, // AN + BlockSeparator, // B + BoundaryNeutral, // BN + CommonNumberSeparator, // CS + DirNonSpacingMark, // NSM + EuropeanNumber, // EN + EuropeanNumberSeparator, // ES + EuropeanNumberTerminator, // ET + FirstStrongIsolate, // FSI + LeftToRight, // L + LeftToRightEmbedding, // LRE + LeftToRightIsolate, // LRI + LeftToRightOverride, // LRO + OtherNeutral, // ON + PopDirectionalFormat, // PDF + PopDirectionalIsolate, // PDI + RightToLeft, // R + RightToLeftArabic, // AL + RightToLeftEmbedding, // RLE + RightToLeftIsolate, // RLI + RightToLeftOverride, // RLO + SegmentSeparator, // S + WhiteSpaceNeutral, // WS +}; + +BidiClass bidirectional_class(u32 code_point); } diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h index c6c6dfd9dab01b..646838e89d4e5e 100644 --- a/Userland/Libraries/LibUnicode/Forward.h +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -10,7 +10,7 @@ namespace Unicode { -enum class BidirectionalClass : u8; +enum class BidiClass; enum class Block : u16; enum class EmojiGroup : u8; enum class GeneralCategory : u8; diff --git a/Userland/Libraries/LibWeb/DOM/Element.cpp b/Userland/Libraries/LibWeb/DOM/Element.cpp index 6d329d46a6baf9..dee6476be5f808 100644 --- a/Userland/Libraries/LibWeb/DOM/Element.cpp +++ b/Userland/Libraries/LibWeb/DOM/Element.cpp @@ -2575,22 +2575,14 @@ bool Element::is_auto_directionality_form_associated_element() const // https://html.spec.whatwg.org/multipage/dom.html#auto-directionality Optional Element::auto_directionality() const { - static auto bidirectional_class_L = Unicode::bidirectional_class_from_string("L"sv); - static auto bidirectional_class_AL = Unicode::bidirectional_class_from_string("AL"sv); - static auto bidirectional_class_R = Unicode::bidirectional_class_from_string("R"sv); - - // AD-HOC: Assume 'ltr' if Unicode data generation is disabled. - if (!bidirectional_class_L.has_value()) - return Directionality::Ltr; - // https://html.spec.whatwg.org/multipage/dom.html#text-node-directionality auto text_node_directionality = [](Text const& text_node) -> Optional { // 1. If text's data does not contain a code point whose bidirectional character type is L, AL, or R, then return null. // 2. Let codePoint be the first code point in text's data whose bidirectional character type is L, AL, or R. - Optional found_character_bidi_class; + Optional found_character_bidi_class; for (auto code_point : Utf8View(text_node.data())) { auto bidi_class = Unicode::bidirectional_class(code_point); - if (first_is_one_of(bidi_class, bidirectional_class_L, bidirectional_class_AL, bidirectional_class_R)) { + if (first_is_one_of(bidi_class, Unicode::BidiClass::LeftToRight, Unicode::BidiClass::RightToLeftArabic, Unicode::BidiClass::RightToLeft)) { found_character_bidi_class = bidi_class; break; } @@ -2599,12 +2591,12 @@ Optional Element::auto_directionality() const return {}; // 3. If codePoint is of bidirectional character type AL or R, then return 'rtl'. - if (first_is_one_of(*found_character_bidi_class, bidirectional_class_AL, bidirectional_class_R)) + if (first_is_one_of(*found_character_bidi_class, Unicode::BidiClass::RightToLeftArabic, Unicode::BidiClass::RightToLeft)) return Directionality::Rtl; // 4. If codePoint is of bidirectional character type L, then return 'ltr'. // NOTE: codePoint should always be of bidirectional character type L by this point, so we can just return 'ltr' here. - VERIFY(*found_character_bidi_class == bidirectional_class_L); + VERIFY(*found_character_bidi_class == Unicode::BidiClass::LeftToRight); return Directionality::Ltr; }; @@ -2618,9 +2610,9 @@ Optional Element::auto_directionality() const // and there is no character of bidirectional character type L anywhere before it in the element's value, then return 'rtl'. for (auto code_point : Utf8View(value)) { auto bidi_class = Unicode::bidirectional_class(code_point); - if (bidi_class == bidirectional_class_L) + if (bidi_class == Unicode::BidiClass::LeftToRight) break; - if (bidi_class == bidirectional_class_AL || bidi_class == bidirectional_class_R) + if (bidi_class == Unicode::BidiClass::RightToLeftArabic || bidi_class == Unicode::BidiClass::RightToLeft) return Directionality::Rtl; }