// ShuffleCoder8.h
// Copyright (c) 2015 Jan Wassenberg
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "Indices.h"
#include "RemoveZeros.h"
#include "Pack.h"
namespace codec {
// special case for two 1-bits out of 8. a straightforward representation
// involves two 3-bit positions, but there are only 28 valid combinations.
// enumerative coding associates each possible outcome with a 5-bit index.
// 4 of the 32 index values remain unused; we place them in the interval [0, 4)
// so that zero inputs (for non-enum5 cases) decode to zero without any
// extra masking.
//
// this same logic is also used in U4 for pairs of nibbles.
// `half' means either nibble is { 3,5,6,9,A,C } and the other nibble is zero.
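// worked example: 0x09 sets bits 0 and 3, both in the low nibble, so
// p_ExclusiveL = 9 and p_ExclusiveH = 0; Encode's tableL maps nibble 9 to
// enum5 = 7, and Decode's table maps index 7 back to 0x09 (see Test below).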
struct TwoBitsHalf
{
// 3 ops
// @param p_ExclusiveL/H [0, 16), 0 if other nibble is nonzero
// @return [4, 16) or 0
static INLINE VecU8 Encode(const VecU8& p_ExclusiveL, const VecU8& p_ExclusiveH)
{
ASSERT(p_ExclusiveL[0] < 16);
ASSERT(p_ExclusiveH[0] < 16);
const VecU8 tableL(0, 0, 0, 9, 0, 8, 7, 0, 0, 6, 5, 0, 4, 0, 0, 0);
const VecU8 tableH(0, 0, 0, 15, 0, 14, 13, 0, 0, 12, 11, 0, 10, 0, 0, 0);
const VecU8 enum5L = Shuffle(tableL, p_ExclusiveL);
const VecU8 enum5H = Shuffle(tableH, p_ExclusiveH);
// (L/H are both zero, or exactly one is valid and nonzero => sum is valid or 0)
return enum5L + enum5H;
}
// 1 op
// @param p_Enum5L [4, 16) or 0 if zero, single or raw8
// @return decoded 8-bit value with two 1-bits, or 0 if p_Enum5L == 0.
static INLINE VecU8 Decode(const VecU8& p_Enum5L)
{
const VecU8 table(0xC0, 0xA0, 0x90, 0x60, 0x50, 0x30, 0x0C, 0x0A, 9, 6, 5, 3, 0, 0, 0, 0);
return Shuffle(table, p_Enum5L);
}
static void Test()
{
const Uint8 values[12] = { 3, 5, 6, 9, 0xA, 0xC, 0x30, 0x50, 0x60, 0x90, 0xA0, 0xC0 };
for (size_t val = 0; val < 256; ++val)
{
const size_t nibbleL = val & 0xF;
const size_t nibbleH = val >> 4;
const size_t exclusiveL = (nibbleH == 0) ? nibbleL : 0;
const size_t exclusiveH = (nibbleL == 0) ? nibbleH : 0;
const VecU8 encoded = Encode(VecU8(Uint8(exclusiveL)), VecU8(Uint8(exclusiveH)));
const VecU8 decoded = Decode(encoded);
const Uint8* pos = std::find(values, values + 12, val);
if (pos == values + 12) // not one of the values we can represent
{
ENSURE(encoded[0] == 0);
ENSURE(decoded[0] == 0);
}
else // encoded as index
{
ENSURE(encoded[0] == 4 + pos - values);
ENSURE(decoded[0] == val);
}
}
}
};
// `both' means exactly one bit in each nibble is set.
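// worked example: 0x42 sets bit 1 in the low nibble and bit 2 in the high nibble;
// Encode yields 16 + 4*2 + 1 = 25, and Decode maps index 25 back to 0x42
// (see Test below).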
struct TwoBitsBoth
{
// 5 ops
// @param p_NibbleL/H [0, 16)
// @return [16, 32) or p_EnumL
static INLINE VecU8 Encode(const VecU8& p_NibbleL, const VecU8& p_NibbleH, const VecU8& p_EnumL)
{
ASSERT(p_NibbleL[0] < 16);
ASSERT(p_NibbleH[0] < 16);
// 1,2,4,8 in lower nibble -> bitIndex + 0x80
const VecU8 tableL(0, 0, 0, 0, 0, 0, 0, 0x83, 0, 0, 0, 0x82, 0, 0x81, 0x80, 0);
const VecU8 L = Shuffle(tableL, p_NibbleL);
// 1,2,4,8 in upper nibble -> 4 * bitIndex + 0x90
const VecU8 tableH(0, 0, 0, 0, 0, 0, 0, 0x9C, 0, 0, 0, 0x98, 0, 0x94, 0x90, 0);
const VecU8 H4 = Shuffle(tableH, p_NibbleH);
const VecU8 bothNonzero = L & H4; // (only MSB valid)
const VecU8 enum5H = H4 + L; // [16, 32) if both nonzero (8-bit wraparound); otherwise discarded by Select below
// return valid enum5, or p_EnumL (may also be zero).
return Select(bothNonzero, enum5H, p_EnumL);
}
// 1 op
// @param p_Enum5H [16, 32) or 0 for undefined result
// @return decoded 8-bit value with two 1-bits
static INLINE VecU8 Decode(const VecU8& p_Enum5H)
{
const VecU8 table(0x88, 0x84, 0x82, 0x81, 0x48, 0x44, 0x42, 0x41,
0x28, 0x24, 0x22, 0x21, 0x18, 0x14, 0x12, 0x11);
return Shuffle(table, p_Enum5H);
}
static void Test()
{
const Uint8 values[16] = { 0x11, 0x12, 0x14, 0x18, 0x21, 0x22, 0x24, 0x28,
0x41, 0x42, 0x44, 0x48, 0x81, 0x82, 0x84, 0x88 };
for (size_t val = 0; val < 256; ++val)
{
const Uint8 nibbleL = static_cast<Uint8>(val & 0xF);
const Uint8 nibbleH = static_cast<Uint8>(val >> 4);
const VecU8 encoded = Encode(VecU8(nibbleL), VecU8(nibbleH), VecU8(0xFE));
const VecU8 decoded = Decode(encoded);
const Uint8* pos = std::find(values, values + 16, val);
if (pos == values + 16) // not one of the values we can represent
{
ENSURE(encoded[0] == 0xFE);
// decoded is undefined
}
else // encoded as index
{
ENSURE(encoded[0] == 16 + pos - values);
ENSURE(decoded[0] == val);
}
}
}
};
//-----------------------------------------------------------------------------
struct U4 {};
// process pairs of nibbles so that we can reuse some of the U8 logic.
// the encoding differs from U8 because all-zero bytes are much less likely
// (only 10%, so a separate nonZero bit would be wasteful). once again we take
// advantage of the relatively few "2 from 8" combinations and code them with
// 'fractional' bits: the 12 `half' combinations share only 3 of the 16 possible
// selector values (plus 2 extra bits each).
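// selector (code4) layout, as produced/consumed by Encode/Decode below:
// zero: 0; half: 1..3 (+2 extra bits); both: 4..7 (+2); raw: 8..11 (+6); single: 12..15 (+1).
// worked example: 0x21 sets bit 0 in the low nibble and bit 1 in the high nibble,
// so TwoBitsBoth::Encode gives 20; code4 = 20 >> 2 = 5 and extra2 = 20 & 3 = 0,
// and the decoder reassembles idxHalfOrBoth = (5 << 2) + 0 = 20, which decodes to 0x21.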
template<> struct Bundle<U4>
{
// (different encoding than Bundle<Uint8>::SingleBit so that code4 can be computed as single >> 1)
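// worked example: 0x20 encodes as single = 29 (see Test below), so the bundle's
// code4 = 29 >> 1 = 14 and extra1 = 29 & 1 = 1; the decoder rebuilds the index
// as 2*14 + 1 = 29.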
struct SingleBit
{
// @return [24, 32) if single bit, otherwise 0
// 3 ops
static INLINE VecU8 Encode(const VecU8& p_ExclusiveL, const VecU8& p_ExclusiveH)
{
const VecU8 tableL(0, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, 26, 0, 25, 24, 0);
const VecU8 tableH(0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 30, 0, 29, 28, 0);
const VecU8 singleL = Shuffle(tableL, p_ExclusiveL);
const VecU8 singleH = Shuffle(tableH, p_ExclusiveH);
return singleL + singleH;
}
// 1 op
// @param p_Index [24, 32) or 0
// @return value of single bit or 0
static INLINE VecU8 Decode(const VecU8& p_Index)
{
const VecU8 powerTable(0x80, 0x40, 0x20, 0x10, 8, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0);
return Shuffle(powerTable, p_Index); // = 0 if zero, both, half, raw
}
static void Test()
{
const Uint8 values[8] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80 };
for (size_t val = 0; val < 256; ++val)
{
const size_t nibbleL = val & 0xF;
const size_t nibbleH = val >> 4;
const size_t exclusiveL = (nibbleH == 0) ? nibbleL : 0;
const size_t exclusiveH = (nibbleL == 0) ? nibbleH : 0;
const VecU8 encoded = Encode(VecU8(Uint8(exclusiveL)), VecU8(Uint8(exclusiveH)));
const VecU8 decoded = Decode(encoded);
const Uint8* pos = std::find(values, values + 8, val);
if (pos == values + 8) // not one of the values we can represent
{
ENSURE(encoded[0] == 0);
ENSURE(decoded[0] == 0);
}
else // encoded as index
{
ENSURE(encoded[0] == 24 + pos - values);
ENSURE(decoded[0] == val);
}
}
}
};
// 53 ops
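// produces one code4 byte per input byte (stored to p_Code4Pos) plus one
// extra1/extra2/extra6 byte per single-bit / two-bit / raw byte, appended via
// RemoveAndStore; EncodeTileShuffle4 later packs these buffers via
// Blocks<1>/<2>/<6> and Remainders<1>/<2>/<6>.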
static INLINE void Encode(const PtrR p_Values,
PtrW& p_Extra1Pos, PtrW& p_Extra2Pos, PtrW& p_Extra6Pos, const PtrW p_Code4Pos)
{
const VecU8 mask1(1);
const VecU8 mask2(3);
const VecU8 mask4(0x0F);
const VecU8 mask6(0x3F);
const auto bytes = LoadA<VecU8>(p_Values);
// isolate nibbles (must mask before 16-bit shift!)
const VecU8 maskedL = mask4 & bytes;
const VecU8 maskedH = AndNot(mask4, bytes);
const VecU8 isZeroH = (maskedL == bytes);
const VecU8 isZeroL = (maskedH == bytes);
const VecU8 nibbleH = VecU8(VecU16(maskedH) >> 4);
const VecU8 exclusiveL = maskedL & isZeroH;
const VecU8 exclusiveH = nibbleH & isZeroL;
// 3 + 5 + 3 ops
const VecU8 half = TwoBitsHalf::Encode(exclusiveL, exclusiveH); // 4..15 or 0
const VecU8 halfOrBoth = TwoBitsBoth::Encode(maskedL, nibbleH, half); // both: 16..31 or 0
const VecU8 single = SingleBit::Encode(exclusiveL, exclusiveH); // 24..31 or 0
const VecU8 isHalfOrBoth = NonZero(halfOrBoth); // bit
const VecU8 isSingle = NonZero(single); // bit
const VecU8 isRaw = AndNot((isHalfOrBoth | isSingle), NonZero(bytes)); // bit
const VecU8 codeHalfOrBoth(VecU16(AndNot(mask2, halfOrBoth)) >> 2); // 1..7 or 0
const VecU8 codeSingle(VecU16(AndNot(mask1, single)) >> 1); // 12..15 or 0
const VecU8 codeHBS = codeHalfOrBoth + codeSingle;
const VecU8 codeRaw = VecU8(8) + VecU8(VecU16(AndNot(mask6, bytes)) >> 6); // 8..11
const VecU8 code4 = Select(isRaw, codeRaw, codeHBS);
StoreA(code4, p_Code4Pos);
const VecU8 extra1 = single & mask1;
const VecU8 extra2 = halfOrBoth & mask2;
const VecU8 extra6 = bytes & mask6;
p_Extra1Pos = RemoveAndStore(extra1, isSingle, p_Extra1Pos);
p_Extra2Pos = RemoveAndStore(extra2, isHalfOrBoth, p_Extra2Pos);
p_Extra6Pos = RemoveAndStore(extra6, isRaw, p_Extra6Pos);
}
// 39 ops
static INLINE void Decode(PtrR& p_Extra1Pos, PtrR& p_Extra2Pos, PtrR& p_Extra6Pos,
const PtrR p_Code4Pos, const PtrW p_Decoded)
{
const VecU8 k3(3);
// zero: 0; half: 1..3; both: 4..7; raw: 8..11; single: 12..15
const VecU8 code4 = LoadA<VecU8>(p_Code4Pos);
// (only MSBs are valid! can only use with Select or LoadAndRestore.)
const VecU8 isRawOrSingle(VecU16(code4) << 4);
const VecU8 isHalfOrBoth = Select(isRawOrSingle, code4, NonZero(code4)); // raw/single: code4's MSB is clear; otherwise NonZero flags half/both
const VecU8 isBoth = isHalfOrBoth & VecU8(VecI8(code4) > VecI8(k3));
const VecU8 isSingle = isRawOrSingle & (isRawOrSingle + isRawOrSingle);
const VecU8 isRaw = AndNot(isSingle, isRawOrSingle);
VecU8 extra1;
p_Extra1Pos = LoadAndRestore00(p_Extra1Pos, isSingle, extra1);
const VecU8 idxSingle = (code4 + code4) + extra1; // single: 24..31; zero: 0
VecU8 extra2;
p_Extra2Pos = LoadAndRestore00(p_Extra2Pos, isHalfOrBoth, extra2);
// half: 4..15; both: 16..31; zero: 0; single/raw: invalid
const VecU8 idxHalfOrBoth = VecU8(VecU16(code4) << 2) + extra2;
VecU8 extra6;
p_Extra6Pos = LoadAndRestore00(p_Extra6Pos, isRaw, extra6);
// zero: 0; half/both/single: invalid
const VecU8 raw = VecU8(VecU16(code4 & k3) << 6) + extra6;
// (both is invalid if code4 == 0, so only use it if isBoth.)
const VecU8 half = TwoBitsHalf::Decode(idxHalfOrBoth); // zero: 0
const VecU8 both = TwoBitsBoth::Decode(idxHalfOrBoth);
const VecU8 single = SingleBit::Decode(idxSingle); // zero: 0
const VecU8 rawOrSingle = Select(isRaw, raw, single); // zero: 0
const VecU8 halfOrBoth = Select(isBoth, both, half); // zero: 0
const VecU8 decoded = Select(isHalfOrBoth, halfOrBoth, rawOrSingle);
(void)Stream(decoded, p_Decoded);
}
static void Test()
{
SingleBit::Test();
}
};
// (don't need a Pair specialization because we don't load nonZeroBits from memory.
// code4, extra1, extra2 and extra6 are packed/unpacked into/from buffers.)
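// stream layout of EncodeTileShuffle4 (see below): a 6-byte count header and the
// bit-packed remainders go to p_PosAny, while whole blocks of
// code4/extra1/extra2/extra6 go to p_PosAligned.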
// @return false if size exceeds raw size
static INLINE bool EncodeTileShuffle4(const PtrR p_Values,
const PtrW p_Buffers, PtrW& p_PosAligned, PtrW& p_PosAny)
{
const PtrW buf1 = p_Buffers + 1 * kBufferSize;
const PtrW buf2 = p_Buffers + 2 * kBufferSize;
const PtrW buf3 = p_Buffers + 3 * kBufferSize;
PtrW posExtra1 = buf1, posExtra2 = buf2, posExtra6 = buf3;
// one code4 byte for pairs of 4-bit packets => same offset as input values
for (size_t pos4 = 0; pos4 < kTileSize; pos4 += kVectorSize)
{
Bundle<U4>::Encode(p_Values + pos4, posExtra1, posExtra2, posExtra6, p_Buffers + pos4);
}
const size_t numExtra1 = posExtra1 - buf1;
const size_t numExtra2 = posExtra2 - buf2;
const size_t numExtra6 = posExtra6 - buf3;
const size_t truncated4 = kTileSize;
const size_t truncated1 = Blocks<1>::Truncate(numExtra1);
const size_t truncated2 = Blocks<2>::Truncate(numExtra2);
const size_t truncated6 = Blocks<6>::Truncate(numExtra6);
// (remainders4 = 0)
const size_t remainders1 = numExtra1 - truncated1;
const size_t remainders2 = numExtra2 - truncated2;
const size_t remainders6 = numExtra6 - truncated6;
const size_t sizeAny = 6 +
Remainders<1>::PackedSize(remainders1) +
Remainders<2>::PackedSize(remainders2) +
Remainders<6>::PackedSize(remainders6);
const size_t sizeAligned =
Blocks<4>::PackedSize(truncated4) +
Blocks<1>::PackedSize(truncated1) +
Blocks<2>::PackedSize(truncated2) +
Blocks<6>::PackedSize(truncated6);
ASSERT((numExtra1 | numExtra2 | numExtra6) != 0);
if (sizeAligned + sizeAny >= kTileSize) return false;
const Uint64 num621 = (numExtra6 << 32) + (numExtra2 << 16) + numExtra1;
p_PosAny = StoreFrom(&num621, 6, p_PosAny);
p_PosAny = Remainders<1>::Pack(RFromW(posExtra1) - remainders1, remainders1, p_PosAny);
p_PosAny = Remainders<2>::Pack(RFromW(posExtra2) - remainders2, remainders2, p_PosAny);
p_PosAny = Remainders<6>::Pack(RFromW(posExtra6) - remainders6, remainders6, p_PosAny);
p_PosAligned = Blocks<4>::Pack(RFromW(p_Buffers), truncated4, p_PosAligned);
p_PosAligned = Blocks<1>::Pack(RFromW(buf1), truncated1, p_PosAligned);
p_PosAligned = Blocks<2>::Pack(RFromW(buf2), truncated2, p_PosAligned);
p_PosAligned = Blocks<6>::Pack(RFromW(buf3), truncated6, p_PosAligned);
return true;
}
static INLINE void DecodeTileShuffle4(
PtrR& p_PosAligned, PtrR& p_PosAny, const PtrW p_Buffers, const PtrW p_Out)
{
const Uint64 num621 = Load<Uint64>(p_PosAny); p_PosAny += 6;
const size_t numExtra1 = num621 & 0xFFFF;
const size_t numExtra2 = (num621 >> 16) & 0xFFFF;
const size_t numExtra6 = (num621 >> 32) & 0xFFFF;
const size_t truncated4 = kTileSize;
const size_t truncated1 = Blocks<1>::Truncate(numExtra1);
const size_t truncated2 = Blocks<2>::Truncate(numExtra2);
const size_t truncated6 = Blocks<6>::Truncate(numExtra6);
// (remainders4 = 0)
const size_t remainders1 = numExtra1 - truncated1;
const size_t remainders2 = numExtra2 - truncated2;
const size_t remainders6 = numExtra6 - truncated6;
const PtrW buf1 = p_Buffers + 1 * kBufferSize;
const PtrW buf2 = p_Buffers + 2 * kBufferSize;
const PtrW buf3 = p_Buffers + 3 * kBufferSize;
p_PosAligned = Blocks<4>::Unpack(p_PosAligned, truncated4, p_Buffers);
p_PosAligned = Blocks<1>::Unpack(p_PosAligned, truncated1, buf1);
p_PosAligned = Blocks<2>::Unpack(p_PosAligned, truncated2, buf2);
p_PosAligned = Blocks<6>::Unpack(p_PosAligned, truncated6, buf3);
p_PosAny = Remainders<1>::Unpack(p_PosAny, remainders1, buf1 + truncated1);
p_PosAny = Remainders<2>::Unpack(p_PosAny, remainders2, buf2 + truncated2);
p_PosAny = Remainders<6>::Unpack(p_PosAny, remainders6, buf3 + truncated6);
PtrR posExtra1 = RFromW(buf1), posExtra2 = RFromW(buf2), posExtra6 = RFromW(buf3);
for (size_t pos4 = 0; pos4 < kTileSize; pos4 += kVectorSize)
{
Bundle<U4>::Decode(posExtra1, posExtra2, posExtra6, RFromW(p_Buffers) + pos4, p_Out + pos4);
}
}
//-----------------------------------------------------------------------------
// for randomly generated bit planes, only 81% of T=U8 packets have zero or
// one bit set, and 9% have 3 or more. for reasons of both size and speed,
// we need to handle these dense cases by encoding the Uint8 value "raw".
//
// #bits  code4  extra2  raw
// 0:     0      0       0
// 1:     8-F    0       0
// 2:     1-7    0-3     0
// R:     0      0       0-FF
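// worked examples: 0x0A has two bits (both in the low nibble), so enum5 = 8,
// code4 = 8 >> 2 = 2 and extra2 = 8 & 3 = 0; 0x10 has a single bit, so enum5 = 0
// and code4 = 12; 0x35 has four bits, so code4 = 0 and the byte is stored raw.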
template<> struct Bundle<Uint8>
{
struct SingleBit
{
// @return [8,16) if single bit, otherwise 0
// 3 ops
static INLINE VecU8 Encode(const VecU8& p_ExclusiveL, const VecU8& p_ExclusiveH)
{
const VecU8 tableL(0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 10, 0, 9, 8, 0);
const VecU8 tableH(0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 14, 0, 13, 12, 0);
const VecU8 singleL = Shuffle(tableL, p_ExclusiveL);
const VecU8 singleH = Shuffle(tableH, p_ExclusiveH);
return singleL + singleH;
}
// 1 op
// @param p_Index [0, 16)
// @return value of the single bit if p_Index is in [8, 16), otherwise 0
static INLINE VecU8 Decode(const VecU8& p_Index)
{
const VecU8 powerTable(0x80, 0x40, 0x20, 0x10, 8, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0);
return Shuffle(powerTable, p_Index); // = 0 if zero, enum5 or raw8
}
static void Test()
{
const Uint8 values[8] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80 };
for (size_t val = 0; val < 256; ++val)
{
const size_t nibbleL = val & 0xF;
const size_t nibbleH = val >> 4;
const size_t exclusiveL = (nibbleH == 0) ? nibbleL : 0;
const size_t exclusiveH = (nibbleL == 0) ? nibbleH : 0;
const VecU8 encoded = Encode(VecU8(Uint8(exclusiveL)), VecU8(Uint8(exclusiveH)));
const VecU8 decoded = Decode(encoded);
const Uint8* pos = std::find(values, values + 8, val);
if (pos == values + 8) // not one of the values we can represent
{
ENSURE(encoded[0] == 0);
ENSURE(decoded[0] == 0);
}
else // encoded as index
{
ENSURE(encoded[0] == 8 + pos - values);
ENSURE(decoded[0] == val);
}
}
}
};
// 21 ops
static INLINE void ComputeEnum5AndCode4(const VecU8& p_Values, VecU8& p_Enum5, VecU8& p_Code4)
{
// isolate nibbles (must mask before 16-bit shift!)
const VecU8 mask0F(0x0F);
const VecU8 maskedL = mask0F & p_Values;
const VecU8 maskedH = AndNot(mask0F, p_Values);
const VecU8 isZeroH = (maskedL == p_Values);
const VecU8 isZeroL = (maskedH == p_Values);
const VecU8 nibbleH = VecU8(VecU16(maskedH) >> 4);
const VecU8 exclusiveL = maskedL & isZeroH;
const VecU8 exclusiveH = nibbleH & isZeroL;
const VecU8 enum5L = TwoBitsHalf::Encode(exclusiveL, exclusiveH);
p_Enum5 = TwoBitsBoth::Encode(maskedL, nibbleH, enum5L);
const VecU8 single = SingleBit::Encode(exclusiveL, exclusiveH);
p_Code4 = (VecU8(VecU16(p_Enum5) >> 2) & mask0F) + single;
}
// 6 ops
static INLINE VecU8 ValueFromEnum5AndCode4(const VecU8& p_Enum5, const VecU8& p_Code4,
const VecU8& p_IsTwoBit)
{
const VecU8 decodedHalf = TwoBitsHalf::Decode(p_Enum5); // = 0 if zero or code4 or raw8
const VecU8 decodedBoth = TwoBitsBoth::Decode(p_Enum5);
const VecU8 isBoth(VecI8(p_Enum5) > VecI8(0x0F));
const VecU8 twoBit = Select(isBoth, decodedBoth, decodedHalf);
const VecU8 oneBit = SingleBit::Decode(p_Code4); // = 0 if zero, enum5 or raw8
return Select(p_IsTwoBit, twoBit, oneBit);
}
// 43 ops (previous 12R: 59)
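// per 16-byte vector: stores one code4 byte per nonzero input byte, one extra2
// byte per two-bit byte, and the full value for raw8 bytes (all via RemoveAndStore).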
// @return nonZeroBits
static INLINE size_t Encode(const PtrR p_Values,
PtrW& p_Code4Pos, PtrW& p_Extra2Pos, PtrW& p_Raw8Pos)
{
const auto values = LoadA<VecU8>(p_Values);
const size_t nonZeroBits = _mm_movemask_epi8(NonZero(values));
VecU8 enum5, code4;
ComputeEnum5AndCode4(values, enum5, code4);
// 20 ops
p_Code4Pos = RemoveAndStore(code4, nonZeroBits, p_Code4Pos);
const size_t extra2Bits = _mm_movemask_epi8(NonZero(enum5));
const VecU8 extra2 = enum5 & VecU8(3);
p_Extra2Pos = RemoveAndStore(extra2, extra2Bits, p_Extra2Pos);
const size_t raw8Bits = Raw8Bits(nonZeroBits, code4);
p_Raw8Pos = RemoveAndStore(values, raw8Bits, p_Raw8Pos);
return nonZeroBits;
}
// 2 ops
static INLINE size_t Raw8Bits(size_t p_NonZeroBits, const VecU8& p_Code4)
{
const VecU8 zero(_mm_setzero_si128());
return p_NonZeroBits & _mm_movemask_epi8(p_Code4 == zero);
}
// 29 ops (previous 12R: 40)
static INLINE void Decode(size_t p_NonZeroBits,
PtrR& p_Code4Pos, PtrR& p_Extra2Pos, PtrR& p_Raw8Pos, const PtrW p_Decoded)
{
VecU8 code4; // = 0 if zero or raw8
p_Code4Pos = LoadAndRestore00(p_Code4Pos, p_NonZeroBits, code4);
VecU8 extra2; // = 0 if zero or code4 or raw8
const VecU8 extra2Table(0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0);
const VecU8 extra2Mask = Shuffle(extra2Table, code4);
p_Extra2Pos = LoadAndRestore00(p_Extra2Pos, extra2Mask, extra2);
const VecU8 enum5 = VecU8(VecU16(code4 & extra2Mask) << 2) + extra2; // = 0 if zero, code4 or raw8
VecU8 decoded; // = 0 if zero or code4 or enum5
const size_t raw8Bits = Raw8Bits(p_NonZeroBits, code4);
p_Raw8Pos = LoadAndRestore00(p_Raw8Pos, raw8Bits, decoded);
decoded += ValueFromEnum5AndCode4(enum5, code4, extra2Mask);
(void)Stream(decoded, p_Decoded);
}
static void Test()
{
TwoBitsHalf::Test();
TwoBitsBoth::Test();
SingleBit::Test();
{
VecU8 enum5(_mm_setzero_si128()), code4(_mm_setzero_si128()), extra2Mask(_mm_setzero_si128());
const VecU8 decoded = ValueFromEnum5AndCode4(enum5, code4, extra2Mask);
ENSURE(decoded[0] == 0);
}
for (size_t val = 0; val < 256; ++val)
{
VecU8 enum5, code4;
ComputeEnum5AndCode4(VecU8(Uint8(val)), enum5, code4);
const size_t bits = PopulationCount64(val);
if (bits != 1 && bits != 2)
{
ENSURE(enum5[0] == 0);
ENSURE(code4[0] == 0);
}
else
{
const VecU8 extra2Table(0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0);
const VecU8 extra2Mask = Shuffle(extra2Table, code4);
const VecU8 decoded = ValueFromEnum5AndCode4(enum5, code4, extra2Mask);
ENSURE(decoded[0] == val);
}
}
}
};
// (still want to write 32 nonZeroBits at a time => use Pair interface)
template<> struct Pair<Uint8>
{
static INLINE void Encode(const PtrR p_Values,
PtrW& p_Pos4, PtrW& p_Pos2, PtrW& p_Pos8, const PtrW p_NonZeroPos)
{
const size_t nonZeroBitsL = Bundle<Uint8>::Encode(p_Values, p_Pos4, p_Pos2, p_Pos8);
const size_t nonZeroBitsH = Bundle<Uint8>::Encode(p_Values + kVectorSize, p_Pos4, p_Pos2, p_Pos8);
const size_t nonZeroBits = (nonZeroBitsH << 16) + nonZeroBitsL;
(void)Store(static_cast<NonZeroBits>(nonZeroBits), p_NonZeroPos);
}
static INLINE void Decode(PtrR& p_Pos4, PtrR& p_Pos2, PtrR& p_Pos8, const PtrR p_NonZeroPos,
const PtrW p_Decoded)
{
NonZeroBits nonZeroBits;
(void)Load(p_NonZeroPos, nonZeroBits);
const size_t nonZeroBitsL = nonZeroBits & 0xFFFF;
const size_t nonZeroBitsH = nonZeroBits >> 16;
Bundle<Uint8>::Decode(nonZeroBitsL, p_Pos4, p_Pos2, p_Pos8, p_Decoded);
Bundle<Uint8>::Decode(nonZeroBitsH, p_Pos4, p_Pos2, p_Pos8, p_Decoded + kVectorSize);
}
};
template<> INLINE void EncodeTileShuffle<Uint8>(const PtrR p_Values,
const PtrW p_Buffers, PtrW& p_PosAligned, PtrW& p_PosAny)
{
// (four 16-bit positions packed into 64 bits are compact but slow.
// a reference to a mutable array of pointers is fairly fast,
// but named parameters are easier to understand and slightly faster.)
const PtrW buf1 = p_Buffers + 1 * kBufferSize;
const PtrW buf2 = p_Buffers + 2 * kBufferSize;
const PtrW buf3 = p_Buffers + 3 * kBufferSize;
PtrW pos4 = buf1, pos2 = buf2;
PtrW pos8 = buf3;
const size_t sizeNonZero = kTileSize / CHAR_BIT;
for (size_t ofs = 0; ofs < sizeNonZero; ofs += sizeof(NonZeroBits))
{
Pair<Uint8>::Encode(p_Values + ofs * CHAR_BIT, pos4, pos2, pos8, p_Buffers + ofs);
}
const size_t num4 = pos4 - buf1;
const size_t num2 = pos2 - buf2;
const size_t num8 = pos8 - buf3;
const size_t truncated4 = Blocks<4>::Truncate(num4);
const size_t truncated2 = Blocks<2>::Truncate(num2);
const size_t remainders4 = num4 - truncated4;
const size_t remainders2 = num2 - truncated2;
const size_t sizeAny = 6 + num8 +
Remainders<4>::PackedSize(remainders4) +
Remainders<2>::PackedSize(remainders2);
const size_t sizeAligned =
Blocks<4>::PackedSize(truncated4) +
Blocks<2>::PackedSize(truncated2);
ASSERT((num4 + num2 + num8) != 0);
ASSERT(sizeNonZero + sizeAny + sizeAligned < kTileSize);
const Uint64 num248 = (num2 << 32) + (num4 << 16) + num8;
p_PosAny = StoreFrom(&num248, 6, p_PosAny);
// write contiguous raw8 to avoid copying in the decoder.
// (unfortunately we cannot stream to p_PosAny due to misalignment)
p_PosAny = StoreFrom(buf3, num8, p_PosAny);
p_PosAny = Remainders<4>::Pack(RFromW(pos4) - remainders4, remainders4, p_PosAny);
p_PosAny = Remainders<2>::Pack(RFromW(pos2) - remainders2, remainders2, p_PosAny);
p_PosAligned = Stream64From(RFromW(p_Buffers), sizeNonZero, p_PosAligned);
p_PosAligned = Blocks<4>::Pack(RFromW(buf1), truncated4, p_PosAligned);
p_PosAligned = Blocks<2>::Pack(RFromW(buf2), truncated2, p_PosAligned);
}
template<> INLINE void DecodeTileShuffle<Uint8>(
PtrR& p_PosAligned, PtrR& p_PosAny, const PtrW p_Buffers, const PtrW p_Out)
{
const size_t sizeNonZero = kTileSize / CHAR_BIT;
const Uint64 num248 = Load<Uint64>(p_PosAny); p_PosAny += 6;
const size_t num8 = num248 & 0xFFFF;
const size_t num4 = (num248 >> 16) & 0xFFFF;
const size_t num2 = (num248 >> 32) & 0xFFFF;
const size_t truncated4 = Blocks<4>::Truncate(num4);
const size_t truncated2 = Blocks<2>::Truncate(num2);
const size_t remainders4 = num4 - truncated4;
const size_t remainders2 = num2 - truncated2;
const size_t sizeAny = 6 + num8 +
Remainders<4>::PackedSize(remainders4) +
Remainders<2>::PackedSize(remainders2);
const PtrR posNonZero = p_PosAligned; p_PosAligned += sizeNonZero;
const PtrW buf1 = p_Buffers + 1 * kBufferSize;
p_PosAligned = Blocks<4>::Unpack(p_PosAligned, truncated4, p_Buffers);
p_PosAligned = Blocks<2>::Unpack(p_PosAligned, truncated2, buf1);
PtrR pos8 = p_PosAny; p_PosAny += num8;
p_PosAny = Remainders<4>::Unpack(p_PosAny, remainders4, p_Buffers + truncated4);
p_PosAny = Remainders<2>::Unpack(p_PosAny, remainders2, buf1 + truncated2);
PtrR pos4 = RFromW(p_Buffers);
PtrR pos2 = RFromW(buf1);
for (size_t ofs = 0; ofs < sizeNonZero; ofs += sizeof(NonZeroBits))
{
Pair<Uint8>::Decode(pos4, pos2, pos8, posNonZero + ofs, p_Out + ofs * CHAR_BIT);
}
}
} // namespace codec