From 5cf15fe14aaf9da4d83a8353c9012f3ff0e55605 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Thu, 9 May 2024 19:29:38 -0400
Subject: [PATCH] LibGfx/WebPWriter: Use huffman compression

This implements one part of basic WebP compression: Huffman coding.
(The other parts of the basics are backreferences, and color cache
entries; and after that there are the four transforms -- predictor,
subtract green, color indexing, color.)

How much huffman coding helps depends on the input's entropy.
Constant-color channels are now encoded in constant space, but
otherwise a huffman code always needs at least one bit per symbol.
This means just huffman coding can at the very best reduce output
size to 1/8th of input size.

For three test input files:

sunset-retro.png (876K): 2.25M -> 2.02M
(helps fairly little; from 2.6x as big as the png input to 2.36x)

giphy.gif (184k): 11M -> 4.9M
(pretty decent, from 61x as big as the gif input to 27x as big)

7z7c.gif (11K): 775K -> 118K
(almost as good an improvement as is possible with just huffman coding,
from 70x as big as the gif input to 10.7x as big)

No measurable perf impact for encoding.

The code is pretty similar to Deflate.cpp in LibCompress, with just
enough differences that sharing code doesn't look like it's worth
it to me. I left comments outlining similarities.
---
 .../ImageFormats/WebPWriterLossless.cpp       | 286 ++++++++++++++----
 1 file changed, 220 insertions(+), 66 deletions(-)
diff --git a/Userland/Libraries/LibGfx/ImageFormats/WebPWriterLossless.cpp b/Userland/Libraries/LibGfx/ImageFormats/WebPWriterLossless.cpp
index d706f41dce2dba..59492410da06f2 100644
--- a/Userland/Libraries/LibGfx/ImageFormats/WebPWriterLossless.cpp
+++ b/Userland/Libraries/LibGfx/ImageFormats/WebPWriterLossless.cpp
@@ -11,21 +11,13 @@
 #include <AK/Endian.h>
 #include <AK/MemoryStream.h>
 #include <LibCompress/DeflateTables.h>
+#include <LibCompress/Huffman.h>
 #include <LibGfx/Bitmap.h>
 #include <LibGfx/ImageFormats/WebPSharedLossless.h>
 #include <LibGfx/ImageFormats/WebPWriterLossless.h>
 
 namespace Gfx {
 
-static bool are_all_pixels_opaque(Bitmap const& bitmap)
-{
-    for (ARGB32 pixel : bitmap) {
-        if ((pixel >> 24) != 0xff)
-            return false;
-    }
-    return true;
-}
-
 NEVER_INLINE static ErrorOr<void> write_image_data(LittleEndianOutputBitStream& bit_stream, Bitmap const& bitmap, PrefixCodeGroup const& prefix_code_group)
 {
     // This is currently the hot loop. Keep performance in mind when you change it.
@@ -43,6 +35,184 @@ NEVER_INLINE static ErrorOr<void> write_image_data(LittleEndianOutputBitStream&
     return {};
 }
 
+struct code_length_symbol {
+    u8 symbol;
+    u8 count; // Repeat count; only set and read when symbol is one of the run-length codes 16, 17, or 18.
+};
+
+// Run-length encodes `lengths` into `encoded_lengths` (literal lengths plus repeat codes 16, 17, 18) and returns the number of entries written.
+// This is very similar to DeflateCompressor::encode_huffman_lengths(). But:
+// * size can be larger than 288 for green, is always 256 for r, b, a, and is always 40 for distance codes
+// * code 16 has different semantics, requires last_non_zero_symbol
+static size_t encode_huffman_lengths(ReadonlyBytes lengths, Array<code_length_symbol, 256>& encoded_lengths)
+{
+    size_t encoded_count = 0;
+    size_t i = 0;
+    u8 last_non_zero_symbol = 8; // "If code 16 is used before a non-zero value has been emitted, a value of 8 is repeated."
+    while (i < lengths.size()) {
+        if (lengths[i] == 0) {
+            auto zero_count = 0;
+            for (size_t j = i; j < min(lengths.size(), i + 138) && lengths[j] == 0; j++) // 138 is the longest run a single code 18 can express.
+                zero_count++;
+
+            if (zero_count < 3) { // Below the minimum repeated zero count: emit one literal 0 and rescan from the next length.
+                encoded_lengths[encoded_count++].symbol = 0;
+                i++;
+                continue;
+            }
+
+            if (zero_count <= 10) {
+                // "Code 17 emits a streak of zeros [3..10], i.e., 3 + ReadBits(3) times."
+                encoded_lengths[encoded_count].symbol = 17;
+                encoded_lengths[encoded_count++].count = zero_count;
+            } else {
+                // "Code 18 emits a streak of zeros of length [11..138], i.e., 11 + ReadBits(7) times."
+                encoded_lengths[encoded_count].symbol = 18;
+                encoded_lengths[encoded_count++].count = zero_count;
+            }
+            i += zero_count;
+            continue;
+        }
+
+        VERIFY(lengths[i] != 0);
+        last_non_zero_symbol = lengths[i];
+        encoded_lengths[encoded_count++].symbol = lengths[i++];
+
+        // "Code 16 repeats the previous non-zero value [3..6] times, i.e., 3 + ReadBits(2) times."
+        // This is different from deflate. The literal was just emitted above, so only the lengths following it are counted.
+        auto copy_count = 0;
+        for (size_t j = i; j < min(lengths.size(), i + 6) && lengths[j] == last_non_zero_symbol; j++) // A single code 16 repeats at most 6 times.
+            copy_count++;
+
+        if (copy_count >= 3) {
+            encoded_lengths[encoded_count].symbol = 16;
+            encoded_lengths[encoded_count++].count = copy_count;
+            i += copy_count;
+            continue;
+        }
+    }
+    return encoded_count;
+}
+
+static ErrorOr<CanonicalCode> write_simple_code_lengths(LittleEndianOutputBitStream& bit_stream, ReadonlyBytes symbols)
+{
+    // Writes a simple code length code for the given 0, 1, or 2 symbols and returns the CanonicalCode describing them.
+    VERIFY(symbols.size() <= 2);
+    static constexpr Array<u8, 1> empty { 0 };
+    if (symbols.size() == 0) {
+        // "Another special case is when all prefix code lengths are zeros (an empty prefix code). [...]
+        //  empty prefix codes can be coded as those containing a single symbol 0."
+        symbols = empty;
+    }
+
+    unsigned non_zero_symbol_count = symbols.size();
+
+    TRY(bit_stream.write_bits(1u, 1u));                        // Simple code length code.
+    TRY(bit_stream.write_bits(non_zero_symbol_count - 1, 1u)); // num_symbols - 1
+    if (symbols[0] <= 1) { // symbol0 fits in a single bit
+        TRY(bit_stream.write_bits(0u, 1u));         // is_first_8bits: no
+        TRY(bit_stream.write_bits(symbols[0], 1u)); // symbol0
+    } else {
+        TRY(bit_stream.write_bits(1u, 1u));         // is_first_8bits: yes
+        TRY(bit_stream.write_bits(symbols[0], 8u)); // symbol0
+    }
+    if (non_zero_symbol_count > 1)
+        TRY(bit_stream.write_bits(symbols[1], 8u)); // symbol1 is always written with 8 bits
+
+    Array<u8, 256> bits_per_symbol {};
+    // "When coding a single leaf node [...], all but one code length are zeros, and the single leaf node value
+    //  is marked with the length of 1 -- even when no bits are consumed when that single leaf node tree is used."
+    // CanonicalCode follows that convention too, even when describing simple code lengths.
+    bits_per_symbol[symbols[0]] = 1;
+    if (non_zero_symbol_count > 1)
+        bits_per_symbol[symbols[1]] = 1;
+
+    return MUST(CanonicalCode::from_bytes(bits_per_symbol));
+}
+
+static ErrorOr<CanonicalCode> write_normal_code_lengths(LittleEndianOutputBitStream& bit_stream, Array<u8, 256> const& bit_lengths, size_t alphabet_size)
+{
+    // Writes a normal code length code for bit_lengths (how many bits each symbol is encoded with) and returns the matching CanonicalCode.
+
+    // Drop trailing zero lengths.
+    // This will keep at least three symbols; else we would've called write_simple_code_lengths() instead.
+    // This is similar to the loops in Deflate::encode_block_lengths().
+    size_t code_count = bit_lengths.size();
+    while (bit_lengths[code_count - 1] == 0) {
+        code_count--;
+        VERIFY(code_count > 2);
+    }
+
+    Array<code_length_symbol, 256> encoded_lengths {};
+    auto encoded_lengths_count = encode_huffman_lengths(bit_lengths.span().trim(code_count), encoded_lengths);
+
+    // The code to compute code length code lengths is very similar to some of the code in DeflateCompressor::flush().
+    // count code length frequencies
+    Array<u16, 19> code_lengths_frequencies { 0 };
+    for (size_t i = 0; i < encoded_lengths_count; i++) {
+        VERIFY(code_lengths_frequencies[encoded_lengths[i].symbol] < UINT16_MAX);
+        code_lengths_frequencies[encoded_lengths[i].symbol]++;
+    }
+
+    // generate optimal huffman code lengths code lengths
+    Array<u8, 19> code_lengths_bit_lengths {};
+    Compress::generate_huffman_lengths(code_lengths_bit_lengths, code_lengths_frequencies, 7); // each code length code length is written with 3 bits below, so it can be at most 7
+    // calculate actual code length code lengths count (without trailing zeros)
+    auto code_lengths_count = code_lengths_bit_lengths.size();
+    while (code_lengths_bit_lengths[kCodeLengthCodeOrder[code_lengths_count - 1]] == 0)
+        code_lengths_count--;
+
+    TRY(bit_stream.write_bits(0u, 1u)); // Normal code length code.
+
+    // This here isn't needed in Deflate because it always writes EndOfBlock. WebP does not have an EndOfBlock marker, so it needs this check.
+    if (code_lengths_count < 4)
+        code_lengths_count = 4;
+    dbgln_if(WEBP_DEBUG, "writing code_lengths_count: {}", code_lengths_count);
+
+    // WebP uses a different kCodeLengthCodeOrder than deflate. Other than that, the following is similar to a loop in Compress::write_dynamic_huffman().
+    // "int num_code_lengths = 4 + ReadBits(4);"
+    TRY(bit_stream.write_bits(code_lengths_count - 4u, 4u));
+
+    for (size_t i = 0; i < code_lengths_count; i++) {
+        TRY(bit_stream.write_bits(code_lengths_bit_lengths[kCodeLengthCodeOrder[i]], 3));
+    }
+
+    // Write code lengths. This is slightly different from deflate too -- deflate writes literal and distance lengths here,
+    // while WebP writes one of these codes each for g, r, b, a, and distance.
+    if (alphabet_size == encoded_lengths_count) {
+        TRY(bit_stream.write_bits(0u, 1u)); // max_symbol is alphabet_size
+    } else {
+        TRY(bit_stream.write_bits(1u, 1u)); // max_symbol is explicitly coded
+        // "int length_nbits = 2 + 2 * ReadBits(3);
+        //  int max_symbol = 2 + ReadBits(length_nbits);"
+        unsigned needed_length_nbits = 2; // length_nbits is even and in [2, 16]; pick the smallest width that holds max_symbol - 2.
+        while (((encoded_lengths_count - 2) >> needed_length_nbits) != 0)
+            needed_length_nbits += 2;
+        VERIFY(needed_length_nbits <= 16);
+        TRY(bit_stream.write_bits((needed_length_nbits - 2) / 2, 3u));              // length_nbits = 2 + 2 * this value
+        TRY(bit_stream.write_bits(encoded_lengths_count - 2, needed_length_nbits)); // max_symbol = 2 + this value
+    }
+
+    // The rest is identical to write_dynamic_huffman() again. (Code 16 has different semantics, but that doesn't matter here.)
+    auto code_lengths_code = MUST(CanonicalCode::from_bytes(code_lengths_bit_lengths));
+    for (size_t i = 0; i < encoded_lengths_count; i++) {
+        auto encoded_length = encoded_lengths[i];
+        TRY(code_lengths_code.write_symbol(bit_stream, encoded_length.symbol));
+        if (encoded_length.symbol == 16) {
+            // "Code 16 repeats the previous non-zero value [3..6] times, i.e., 3 + ReadBits(2) times."
+            TRY(bit_stream.write_bits<u8>(encoded_length.count - 3, 2));
+        } else if (encoded_length.symbol == 17) {
+            // "Code 17 emits a streak of zeros [3..10], i.e., 3 + ReadBits(3) times."
+            TRY(bit_stream.write_bits<u8>(encoded_length.count - 3, 3));
+        } else if (encoded_length.symbol == 18) {
+            // "Code 18 emits a streak of zeros of length [11..138], i.e., 11 + ReadBits(7) times."
+            TRY(bit_stream.write_bits<u8>(encoded_length.count - 11, 7));
+        }
+    }
+
+    return CanonicalCode::from_bytes(bit_lengths.span().trim(code_count));
+}
+
 static ErrorOr<void> write_VP8L_image_data(Stream& stream, Bitmap const& bitmap)
 {
     LittleEndianOutputBitStream bit_stream { MaybeOwned<Stream>(stream) };
@@ -75,78 +245,62 @@ static ErrorOr<void> write_VP8L_image_data(Stream& stream, Bitmap const& bitmap)
     //  Prefix code #5: Used for backward-reference distance."
 
     // We use neither back-references not color cache entries yet.
-    // We write prefix trees for 256 literals all of length 8, which means each byte is encoded as itself.
-    // That doesn't give any compression, but is a valid bit stream.
     // We can make this smarter later on.
 
     size_t const color_cache_size = 0;
-    constexpr Array alphabet_sizes = to_array<size_t>({ 256 + 24 + static_cast<size_t>(color_cache_size), 256, 256, 256, 40 }); // XXX Shared?
+    constexpr Array alphabet_sizes = to_array<size_t>({ 256 + 24 + static_cast<size_t>(color_cache_size), 256, 256, 256, 40 });
 
     // If you add support for color cache: At the moment, CanonicalCodes does not support writing more than 288 symbols.
     if (alphabet_sizes[0] > 288)
         return Error::from_string_literal("Invalid alphabet size");
 
-    bool all_pixels_are_opaque = are_all_pixels_opaque(bitmap);
-
-    PrefixCodeGroup prefix_code_group;
-    int number_of_full_channels = all_pixels_are_opaque ? 3 : 4;
-    for (int i = 0; i < number_of_full_channels; ++i) {
-        TRY(bit_stream.write_bits(0u, 1u)); // Normal code length code.
-
-        // Write code length codes.
-        constexpr int kCodeLengthCodes = 19;
-        Array<int, kCodeLengthCodes> kCodeLengthCodeOrder = { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-        int num_code_lengths = max(4u, find_index(kCodeLengthCodeOrder.begin(), kCodeLengthCodeOrder.end(), 8) + 1);
-
-        // "int num_code_lengths = 4 + ReadBits(4);"
-        TRY(bit_stream.write_bits(num_code_lengths - 4u, 4u));
-
-        for (int i = 0; i < num_code_lengths - 1; ++i)
-            TRY(bit_stream.write_bits(0u, 3u));
-        TRY(bit_stream.write_bits(1u, 3u));
-
-        // Write code lengths.
-        if (alphabet_sizes[i] == 256) {
-            TRY(bit_stream.write_bits(0u, 1u)); // max_symbol is alphabet_size
-        } else {
-            TRY(bit_stream.write_bits(1u, 1u)); // max_symbol is explicitly coded
-            // "int length_nbits = 2 + 2 * ReadBits(3);
-            //  int max_symbol = 2 + ReadBits(length_nbits);"
-            TRY(bit_stream.write_bits(3u, 3u));   // length_nbits = 2 + 2 * 3
-            TRY(bit_stream.write_bits(254u, 8u)); // max_symbol = 2 + 254
-        }
+    // We do use huffman coding by writing a single prefix-code-group for the entire image.
+    // FIXME: Consider using a meta-prefix image and using one prefix-code-group per tile.
 
-        auto bits_per_symbol = Array<u8, 256>::from_repeated_value(8);
-        prefix_code_group[i] = TRY(CanonicalCode::from_bytes(bits_per_symbol));
+    // FIXME: generate_huffman_lengths() currently halves a frequency cap if the maximum bit length is reached.
+    //        This has the effect of giving very frequent symbols a higher bit length than they would have otherwise.
+    //        Instead, try dividing frequencies by 2 if the maximum bit length is reached.
+    //        Then, low-frequency symbols will get a higher bit length than they would have otherwise, which might help
+    //        compressed size. (For deflate, it doesn't matter much since their blocks are 64kiB large, but for WebP
+    //        we currently use a single huffman tree per channel for the entire image.)
+    Array<Array<u16, 256>, 4> symbol_frequencies {};
+    for (ARGB32 pixel : bitmap) {
+        static constexpr auto saturating_increment = [](u16& value) {
+            if (value < UINT16_MAX)
+                value++;
+        };
+        saturating_increment(symbol_frequencies[0][(pixel >> 8) & 0xff]);  // green
+        saturating_increment(symbol_frequencies[1][(pixel >> 16) & 0xff]); // red
+        saturating_increment(symbol_frequencies[2][pixel & 0xff]);         // blue
+        saturating_increment(symbol_frequencies[3][pixel >> 24]);          // alpha
+    }
 
-        // The code length codes only contain a single entry for '8'. WebP streams with a single element store 0 bits per element.
-        // (This is different from deflate, which needs 1 bit per element.)
+    Array<Array<u8, 256>, 4> code_lengths {};
+    for (int i = 0; i < 4; ++i) {
+        // "Code [0..15] indicates literal code lengths." => the maximum bit length is 15.
+        Compress::generate_huffman_lengths(code_lengths[i], symbol_frequencies[i], 15);
     }
 
-    if (all_pixels_are_opaque) {
-        // Use a simple 1-element code.
-        TRY(bit_stream.write_bits(1u, 1u));   // Simple code length code.
-        TRY(bit_stream.write_bits(0u, 1u));   // num_symbols - 1
-        TRY(bit_stream.write_bits(1u, 1u));   // is_first_8bits
-        TRY(bit_stream.write_bits(255u, 8u)); // symbol0
-        Array<u8, 256> bits_per_symbol {};
-        // "When coding a single leaf node [...], all but one code length are zeros, and the single leaf node value
-        //  is marked with the length of 1 -- even when no bits are consumed when that single leaf node tree is used."
-        // CanonicalCode follows that convention too, even when describing simple code lengths.
-        bits_per_symbol[255] = 1;
-        prefix_code_group[3] = TRY(CanonicalCode::from_bytes(bits_per_symbol));
+    PrefixCodeGroup prefix_code_group;
+    for (int i = 0; i < 4; ++i) {
+        u8 symbols[2];
+        unsigned non_zero_symbol_count = 0;
+        for (int j = 0; j < 256; ++j) {
+            if (code_lengths[i][j] != 0) {
+                if (non_zero_symbol_count < 2)
+                    symbols[non_zero_symbol_count] = j;
+                non_zero_symbol_count++;
+            }
+        }
+
+        if (non_zero_symbol_count <= 2)
+            prefix_code_group[i] = TRY(write_simple_code_lengths(bit_stream, { symbols, non_zero_symbol_count }));
+        else
+            prefix_code_group[i] = TRY(write_normal_code_lengths(bit_stream, code_lengths[i], alphabet_sizes[i]));
     }
 
     // For code #5, use a simple empty code, since we don't use this yet.
-    // "Note: Another special case is when all prefix code lengths are zeros (an empty prefix code). [...]
-    //  empty prefix codes can be coded as those containing a single symbol 0."
-    TRY(bit_stream.write_bits(1u, 1u)); // Simple code length code.
-    TRY(bit_stream.write_bits(0u, 1u)); // num_symbols - 1
-    TRY(bit_stream.write_bits(0u, 1u)); // is_first_8bits
-    TRY(bit_stream.write_bits(0u, 1u)); // symbol0
-    Array<u8, 256> bits_per_symbol {};
-    bits_per_symbol[0] = 1; // See comment in `if (all_pixels_are_opaque)` block above.
-    prefix_code_group[4] = TRY(CanonicalCode::from_bytes(bits_per_symbol));
+    prefix_code_group[4] = TRY(write_simple_code_lengths(bit_stream, {}));
 
     // Image data.
     TRY(write_image_data(bit_stream, bitmap, prefix_code_group));