add support for LZW compression (#116)

* add support for LZW compression * fix tests * delete test files * optimize bit shifts * unroll getcode() loop * optimize horizontal (un-)differencing * add support for pre-1.10 Julia versions * factor out prediction reversal logic so that it supports all compression techniques * check for overflows * delete test files * revert to spec behavior for codesize increases === I've found inconsistent behavior between Houdini and Pixelmator, so defaulting to behavior consistent with the spec Images will load (apparently) correctly in either case, but we show a warning for missing EOI in the non-spec case * update test * fix overflow check * bump version * add doc for TiffFileStrip * more explicit parameterization for TFS * bug fix --------- Co-authored-by: Tamas Nagy <[email protected]>
tlnagy · Oct 10, 2023 · 8c898d1 · 8c898d1 · tlnagy · Oct 13, 2023
1 parent 560ddd8
commit 8c898d1
Show file tree

Hide file tree

Showing 5 changed files with 257 additions and 23 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TiffImages"
 uuid = "731e570b-9d59-4bfa-96dc-6df516fadf69"
 authors = ["Tamas Nagy <[email protected]>"]
-version = "0.6.8"
+version = "0.7.0"
 
 [deps]
 ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"

diff --git a/src/TiffImages.jl b/src/TiffImages.jl
@@ -23,9 +23,9 @@ const PKGVERSION = @PkgVersion.Version 0
 include("enum.jl")
 include("utils.jl")
 include("files.jl")
-include("compression.jl")
 include("tags.jl")
 include("ifds.jl")
+include("compression.jl")
 include("layout.jl")
 include(joinpath("types", "common.jl"))
 include(joinpath("types", "dense.jl"))

diff --git a/src/compression.jl b/src/compression.jl
@@ -1,39 +1,215 @@
 """
-    read!(tf, arr, comp)
+    read!(io, arr, comp)
 
-Read in an array `arr` from the [`TiffFile`](@ref) stream `tf` inflating the
-data using compression method `comp`. `read!` will dispatch on the value of
-compression and use the correct compression technique to read the data.
+Read in an array `arr` from the [`TiffFile`](@ref) or [`TiffFileStrip`](@ref)
+stream `io`, inflating the data using compression method `comp`. `read!` will
+dispatch on the value of compression and use the correct compression technique
+to read the data.
 """
-Base.read!(tf::TiffFile, arr::AbstractArray, comp::CompressionType) = read!(tf, arr, Val(comp))
+Base.read!(io::Union{TiffFile, TiffFileStrip}, arr::AbstractArray, comp::CompressionType) = read!(io, arr, Val(comp))
+
+"""
+    memcpy(dest, src, n)
+
+Copy `n` bytes from `src` to `dest` by calling the C library's `memcpy` and
+return the pointer that call returns. Neither pointer is validated, and the two
+regions must not overlap (a requirement of C `memcpy`).
+"""
+function memcpy(dest::Ptr{T}, src::Ptr{T}, n::Int) where T
+    # `Csize_t` matches the C prototype; a negative `n` now raises an
+    # `InexactError` instead of being reinterpreted as a huge unsigned count
+    ccall(:memcpy, Ptr{T}, (Ptr{T}, Ptr{T}, Csize_t), dest, src, n)
+end
 
-Base.read!(tf::TiffFile, arr::AbstractArray, ::Val{COMPRESSION_NONE}) = read!(tf, arr)
+Base.read!(io::Union{TiffFile, TiffFileStrip}, arr::AbstractArray, ::Val{COMPRESSION_NONE}) = read!(io, arr)
 
-function Base.read!(tf::TiffFile, arr::AbstractArray{T, N}, ::Val{COMPRESSION_PACKBITS}) where {T, N}
+function Base.read!(tfs::TiffFileStrip, arr::AbstractArray{T, N}, ::Val{COMPRESSION_PACKBITS}) where {T, N}
     pos = 1
     nbit = Array{Int8}(undef, 1)
     nxt = Array{T}(undef, 1)
     while pos < length(arr)
-        read!(tf, nbit)
+        read!(tfs.tf, nbit)
         n = nbit[1]
         if 0 <= n <= 127
-            read!(tf, view(arr, pos:pos+n))
+            read!(tfs.tf, view(arr, pos:pos+n))
             pos += n
         elseif -127 <= n <= -1
-            read!(tf, nxt)
+            read!(tfs.tf, nxt)
             arr[pos:(pos-n)] .= nxt[1]
             pos += -n
         end
         pos += 1
     end
 end
 
-function Base.read!(tf::TiffFile, arr::AbstractArray, ::Val{COMPRESSION_DEFLATE})
-    readbytes!(InflateZlibStream(tf.io.io), reinterpret(UInt8, vec(arr)))
+function Base.read!(tfs::TiffFileStrip, arr::AbstractArray, ::Val{COMPRESSION_DEFLATE})
+    readbytes!(InflateZlibStream(tfs.tf.io.io), reinterpret(UInt8, vec(arr)))
 end
 
-function Base.read!(tf::TiffFile, arr::AbstractArray, ::Val{COMPRESSION_ADOBE_DEFLATE})
-    readbytes!(InflateZlibStream(tf.io.io), reinterpret(UInt8, vec(arr)))
+function Base.read!(tfs::TiffFileStrip, arr::AbstractArray, ::Val{COMPRESSION_ADOBE_DEFLATE})
+    readbytes!(InflateZlibStream(tfs.tf.io.io), reinterpret(UInt8, vec(arr)))
+end
+
+"""
+    lzw_decode!(io, arr::AbstractArray)
+
+Decode the LZW stream available on `io` into the memory backing `arr`.
+
+All `bytesavailable(io)` bytes are read from `io` up front and then decoded
+following the LZW pseudocode of the TIFF 6.0 specification. Internally every
+code is shifted by one so that it can be used directly as a 1-based index.
+
+Decoding stops at the end-of-information (EOI) code or once `sizeof(arr)` bytes
+have been produced. A warning is logged when the number of decoded bytes
+differs from `sizeof(arr)`, or when the output is complete but the last code
+read was not EOI.
+
+Throws an `ErrorException` whose message starts with `LZW:` when decoding
+fails, e.g. when the input ends prematurely, when a code refers to a string
+that is not in the table, or when a write would run past the end of the output
+buffer, the string table or the code table.
+"""
+function lzw_decode!(io, arr::AbstractArray)
+    CLEAR_CODE::Int = 256 + 1
+    EOI_CODE::Int = 257 + 1
+    TABLE_ENTRY_LENGTH_BITS::Int = 16
+    MAX_TABLE_ENTRIES::Int = 4097 # number of slots in the offsets table allocated below
+
+    out_pointer::Ptr{UInt8} = reinterpret(Ptr{UInt8}, pointer(arr))
+    output_size::Int = sizeof(arr)
+    out_position::Int = 0 # number of bytes written to the output so far
+
+    table_size::Int = output_size * 2 + 258
+    table_pointer::Ptr{UInt8} = reinterpret(Ptr{UInt8}, Libc.malloc(table_size)) # table of strings
+    table_offsets_pointer::Ptr{Int} = reinterpret(Ptr{Int}, Libc.malloc(sizeof(Int) * MAX_TABLE_ENTRIES)) # offsets into table
+
+    # A table entry packs the string length (upper 16 bits) and the string's
+    # byte offset into the string table (remaining bits) into a single word.
+    # NOTE(review): the shift amounts are written for a 64-bit `Int`.
+    @inline create_table_entry(length, offset) = Base.shl_int(length, (64 - TABLE_ENTRY_LENGTH_BITS)) | offset
+    @inline table_entry_length(table_entry) = Base.lshr_int(table_entry, 64 - TABLE_ENTRY_LENGTH_BITS)
+    @inline table_entry_offset(table_entry) = table_entry & (Base.shl_int(1, 64 - TABLE_ENTRY_LENGTH_BITS) - 1)
+
+    try
+        # `Libc.free` accepts a null pointer, so bailing out here is safe
+        (table_pointer == C_NULL || table_offsets_pointer == C_NULL) && error("LZW: unable to allocate decoding tables")
+
+        # InitializeTable();
+        foreach(i -> unsafe_store!(table_pointer + i, UInt8(i)), 0:255)
+        foreach(i -> unsafe_store!(table_offsets_pointer, create_table_entry(1, i), i+1), 0:259) # length is stored in upper 16 bits
+
+        code = -1
+
+        buffer::Int=0 # buffer for reading in codes
+        bitcount::Int=0 # number of valid bits in buffer
+        codesize::Int=9 # current number of bits per code
+        input::Vector{UInt8} = Vector{UInt8}(undef, bytesavailable(io))
+        read!(io, input)
+
+        # Pull the next `codesize`-bit code out of `input`, most significant bit
+        # first. Returns the updated reader state together with the previous
+        # code. Indexing `input` is bounds checked, so a truncated stream raises
+        # a `BoundsError` that is reported by the `catch` block below.
+        function getcode(buffer, code, bitcount, codesize, i)
+            old_code::Int = code
+
+            # make sure we have enough bits in the buffer
+            if bitcount < codesize
+                buffer = Base.shl_int(buffer, 8) | input[i+=1]
+                bitcount += 8
+            end
+
+            # one more time (since the max code size is 12 bits, only need to check twice)
+            if bitcount < codesize
+                buffer = Base.shl_int(buffer, 8) | input[i+=1]
+                bitcount += 8
+            end
+
+            code = Base.lshr_int(buffer, bitcount - codesize) & (Base.shl_int(1, codesize) - 1)
+            bitcount -= codesize
+            # codes are shifted by one so they can be used as 1-based indices
+            (buffer, code + 1, old_code, bitcount, codesize, i)
+        end
+
+        # These checks must throw: `@error` would merely log a message and let
+        # the following `memcpy`/`unsafe_store!` write out of bounds.
+        @inline check_table_overflow(start, length) = start + length > table_size && error("LZW: table buffer overflow")
+        @inline check_output_overflow(start, length) = start + length > output_size && error("LZW: output buffer overflow")
+        @inline check_table_count(count) = count >= MAX_TABLE_ENTRIES && error("LZW: code table overflow")
+
+        # annotated with excerpts from the LZW pseudocode in the TIFF 6.0 spec
+        # https://developer.adobe.com/content/dam/udp/en/open/standards/tiff/TIFF6.pdf
+        table_count::Int = 258 # number of (valid) table entries; 256 one-byte codes + CLEAR_CODE + EOI_CODE
+        next_table_offset::Int = 258
+        input_pos::Int = 0 # current position in input
+        while true
+            # GetNextCode()
+            (buffer, code, old_code, bitcount, codesize, input_pos) = getcode(buffer, code, bitcount, codesize, input_pos)
+            if code == EOI_CODE || out_position >= output_size
+                break
+            elseif code == CLEAR_CODE # reset table
+                # InitializeTable();
+                table_count = 258
+                next_table_offset = 258
+                codesize = 9
+                # Code = GetNextCode();
+                (buffer, code, old_code, bitcount, codesize, input_pos) = getcode(buffer, code, bitcount, codesize, input_pos)
+                if code == EOI_CODE
+                    break
+                end
+                # nothing beyond the initial entries exists right after a reset,
+                # so any larger code would dereference a stale or uninitialized entry
+                code > table_count && error("LZW: invalid code after clear code")
+
+                # WriteString(StringFromCode(Code))
+                r = unsafe_load(table_offsets_pointer, code)
+                len = table_entry_length(r)
+
+                check_output_overflow(out_position, len)
+
+                memcpy(out_pointer + out_position, table_pointer + table_entry_offset(r), len)
+                out_position += len
+            else
+                if code <= table_count
+                    # WriteString(StringFromCode(Code));
+                    if code <= 256
+                        # this is redundant with the check above, but it makes
+                        # the code easier to reason about and less bug prone
+                        check_output_overflow(out_position, 1)
+
+                        unsafe_store!(out_pointer + out_position, code - 1)
+                        out_position += 1
+                    else
+                        r = unsafe_load(table_offsets_pointer, code)
+                        len = table_entry_length(r)
+
+                        check_output_overflow(out_position, len)
+
+                        memcpy(out_pointer + out_position, table_pointer + table_entry_offset(r), len)
+                        out_position += len
+                    end
+
+                    # AddStringToTable(StringFromCode(OldCode) + FirstChar(StringFromCode(Code)));
+                    check_table_count(table_count)
+                    table_count += 1
+                    len = 1
+                    if old_code <= 256
+                        check_table_overflow(next_table_offset, 2) # this byte + the next one
+
+                        unsafe_store!(table_pointer + next_table_offset, UInt8(old_code - 1))
+                    else
+                        r = unsafe_load(table_offsets_pointer, old_code)
+                        len = table_entry_length(r)
+
+                        check_table_overflow(next_table_offset, len + 1) # these bytes + the next one
+
+                        memcpy(table_pointer + next_table_offset, table_pointer + table_entry_offset(r), len)
+                    end
+
+                    if code <= 256
+                        unsafe_store!(table_pointer + next_table_offset + len, UInt8(code - 1))
+                    else
+                        r = unsafe_load(table_offsets_pointer, code)
+                        memcpy(table_pointer + next_table_offset + len, table_pointer + table_entry_offset(r), 1)
+                    end
+                    unsafe_store!(table_offsets_pointer, create_table_entry(len + 1, next_table_offset), table_count)
+                    next_table_offset += len + 1
+                else
+                    # A code that is not in the table yet can only be the entry
+                    # that is about to be created, and it needs a previous code
+                    # to be built from; anything else is a corrupt stream.
+                    (old_code < 1 || code != table_count + 1) && error("LZW: invalid code in stream")
+
+                    # WriteString(StringFromCode(OldCode) + FirstChar(StringFromCode(OldCode)));
+                    r = unsafe_load(table_offsets_pointer, old_code)
+                    len = table_entry_length(r)
+
+                    check_output_overflow(out_position, len + 1)
+
+                    memcpy(out_pointer + out_position, table_pointer + table_entry_offset(r), len)
+                    unsafe_store!(out_pointer + out_position + len, unsafe_load(table_pointer + table_entry_offset(r)))
+                    out_position += len + 1
+
+                    check_table_overflow(next_table_offset, len + 1)
+                    check_table_count(table_count)
+
+                    # AddStringToTable(StringFromCode(OldCode) + FirstChar(StringFromCode(OldCode)));
+                    table_count += 1
+                    memcpy(table_pointer + next_table_offset, table_pointer + table_entry_offset(r), len)
+                    memcpy(table_pointer + next_table_offset + len, table_pointer + table_entry_offset(r), 1)
+                    unsafe_store!(table_offsets_pointer, create_table_entry(len + 1, next_table_offset), table_count)
+                    next_table_offset += len + 1
+                end
+            end
+
+            # widen the codes once the table holds this many entries
+            if table_count == 511
+                codesize = 10
+            elseif table_count == 1023
+                codesize = 11
+            elseif table_count == 2047
+                codesize = 12
+            end
+        end
+
+        out_position != output_size && @warn "LZW: expected $output_size bytes, got $out_position bytes"
+        out_position == output_size && code != EOI_CODE && @warn "LZW: missing EOI code"
+    catch e
+        # errors raised above already carry the "LZW:" prefix; pass them on as they are
+        e isa ErrorException && startswith(e.msg, "LZW:") && rethrow()
+        error("LZW: $e")
+    finally
+        Libc.free(table_pointer)
+        Libc.free(table_offsets_pointer)
+    end
+end
+
+# LZW-compressed strips are handed to `lzw_decode!`, which reads from the strip
+# and writes into `arr`
+Base.read!(tfs::TiffFileStrip{S}, arr::AbstractArray{T, N}, ::Val{COMPRESSION_LZW}) where {T, N, S} =
+    lzw_decode!(tfs, arr)
 
 """
@@ -46,15 +222,16 @@ julia> TiffImages.get_inflator(first(methods(read!, [TiffImages.TiffFile, Abstra
 COMPRESSION_NONE::CompressionType = 1
 ```
 """
-get_inflator(::Type{Tuple{typeof(read!), TiffFile, AbstractArray{T, N} where {T, N}, Val{C}}}) where C = C
+get_inflator(::Type{Tuple{typeof(read!), TiffFileStrip, AbstractArray{T, N} where {T, N}, Val{C}}}) where C = C
+get_inflator(::Type{Tuple{typeof(read!), Union{TiffFile, TiffFileStrip{S} where S}, AbstractArray{T, N} where {T, N}, Val{C}}}) where C = C
 
 # autogenerate nice error messages for all non-implemented inflation methods
-implemented = map(x->get_inflator(x.sig), methods(read!, [TiffFile, AbstractArray, Val], ))
+implemented = map(x->get_inflator(x.sig), methods(read!, [Union{TiffFile, TiffFileStrip}, AbstractArray, Val], ))
 comps = Set(instances(CompressionType))
 setdiff!(comps, implemented)
 
 for comp in comps
     eval(quote
-        Base.read!(tf::TiffFile, arr::AbstractArray, ::Val{$comp}) = error("Compression ", $comp, " is not implemented. Please open an issue against TiffImages.jl.")
+        Base.read!(io::Union{TiffFile, TiffFileStrip}, arr::AbstractArray, ::Val{$comp}) = error("Compression ", $comp, " is not implemented. Please open an issue against TiffImages.jl.")
     end)
-end
+end
diff --git a/src/ifds.jl b/src/ifds.jl
@@ -203,7 +203,28 @@ function Base.iterate(file::TiffFile, state::Tuple{Union{IFD{O}, Nothing}, Int})
     return (curr_ifd, (next_ifd, next_ifd_offset))
 end
 
-function Base.read!(target::AbstractArray{T, N}, tf::TiffFile, ifd::IFD) where {T, N}
+"""
+    $(TYPEDEF)
+
+A strip is a contiguous block of separately-encoded image data. A TIFF
+file will typically have multiple strips, each representing multiple rows of
+pixels in the image.
+
+The type parameters `O` and `S` are those of the wrapped `TiffFile{O, S}`. `P` is
+not stored in any field; `reverse_prediction!` uses it as the element type when
+it walks over the decoded strip.
+
+$(FIELDS)
+"""
+struct TiffFileStrip{O, S, P}
+    """The file stream"""
+    tf::TiffFile{O, S}
+    """The IFD corresponding to this strip"""
+    ifd::IFD{O}
+    """The number of bytes in this strip"""
+    bytes::Int
+end
+
+# Reading from a strip without a compression argument reads straight from the
+# wrapped file stream
+Base.read!(tfs::TiffFileStrip, arr::AbstractArray) = read!(tfs.tf, arr)
+# A strip reports the byte count it was constructed with, not what is left in the file
+Base.bytesavailable(tfs::TiffFileStrip) = tfs.bytes
+
+function Base.read!(target::AbstractArray{T, N}, tf::TiffFile{O, S}, ifd::IFD{O}) where {T, N, O, S}
     strip_offsets = ifd[STRIPOFFSETS].data
 
     if PLANARCONFIG in ifd
@@ -229,11 +250,18 @@ function Base.read!(target::AbstractArray{T, N}, tf::TiffFile, ifd::IFD) where {
             strip_nbytes[end] = (rows - (rowsperstrip * (nstrips-1))) * cols * sizeof(T)
         end
 
+        bytes = ifd[STRIPBYTECOUNTS].data
+
         startbyte = 1
+        comp = Val(compression)
+        rtype = rawtype(ifd)
         for i in 1:nstrips
             seek(tf, strip_offsets[i]::Core.BuiltinInts)
             nbytes = Int(strip_nbytes[i]::Core.BuiltinInts / sizeof(T))
-            read!(tf, view(target, startbyte:(startbyte+nbytes-1)), compression)
+            tfs = TiffFileStrip{O, S, rtype}(tf, ifd, bytes[i])
+            arr = view(target, startbyte:(startbyte+nbytes-1))
+            read!(tfs, arr, comp)
+            reverse_prediction!(tfs, arr)
             startbyte += nbytes
         end
     else
@@ -297,3 +325,26 @@ function Base.write(tf::TiffFile{O}, ifd::IFD{O}) where {O <: Unsigned}
 
     return ifd_end_pos
 end
+
+"""
+    reverse_prediction!(tfs::TiffFileStrip, arr::AbstractArray)
+
+Undo, in place, the prediction step that was applied to the strip `arr` before
+it was compressed. Only horizontal differencing (`PREDICTOR == 2`) is handled;
+for any other predictor value `arr` is left untouched.
+
+The memory of `arr` is interpreted as values of the strip's type parameter `P`.
+Within every row, and separately for each of the interleaved samples of a
+pixel, each value is replaced by the running sum of the values up to and
+including it.
+"""
+function reverse_prediction!(tfs::TiffFileStrip{O, S, P}, arr::AbstractArray{T, N}) where {O, S, P, T, N}
+    predictor::Int = Int(getdata(tfs.ifd, PREDICTOR, 0))
+    # TIFF defines SamplesPerPixel as 1 when the tag is absent. Falling back to
+    # 0 would make the `1:spp` loop below empty, so single-sample images without
+    # the tag would silently keep their differenced values.
+    spp::Int = Int(getdata(tfs.ifd, SAMPLESPERPIXEL, 1))
+    if predictor == 2
+        columns = Int(ncols(tfs.ifd))
+        rows = cld(length(arr), columns) # number of rows in this strip
+
+        # horizontal differencing
+        temp::Ptr{P} = reinterpret(Ptr{P}, pointer(arr))
+        for row in 1:rows
+            start = (row - 1) * columns * spp # samples preceding this row
+            for plane in 1:spp
+                # the first sample of a row is stored as is and seeds the running sum
+                previous::P = unsafe_load(temp, start + plane)
+                for i in (spp + plane):spp:(columns - 1) * spp + plane
+                    current = unsafe_load(temp, start + i) + previous
+                    unsafe_store!(temp, current, start + i)
+                    previous = current
+                end
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -215,4 +215,10 @@ end
     ifd[TiffImages.ROWSPERSTRIP] = 256
 
     @test TiffImages.iscontiguous(ifd)
-end
+end
+
+@testset "LZW" begin
+    # the LZW-compressed example must decode to the same image as its
+    # uncompressed counterpart
+    reference_path = get_example("shapes_uncompressed.tif")
+    lzw_path = get_example("shapes_lzw.tif")
+    @test TiffImages.load(reference_path) == TiffImages.load(lzw_path)
+end