Skip to content

Commit

Permalink
Merge branch 'master' of github.com:lemire/simdcomp
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire committed Oct 7, 2015
2 parents c5d2fed + 3ed1fce commit dac115f
Show file tree
Hide file tree
Showing 20 changed files with 27,478 additions and 4,189 deletions.
57 changes: 43 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
The SIMDComp library
The SIMDComp library
====================
[![Build Status](https://travis-ci.org/lemire/simdcomp.png)](https://travis-ci.org/lemire/simdcomp)

A simple C library for compressing lists of integers using binary packing and SIMD instructions.
The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of random numbers.
The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers.

This library can decode at least 4 billion compressed integers per second on most
desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s.
Expand All @@ -19,7 +19,7 @@ format. It is up to the (sophisticated) user to create a compressed format.
Requirements
-------------

- Your processor should support SSE2 (Pentium4 or better)
- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.)
- C99 compliant compiler (GCC is assumed)
- A Linux-like distribution is assumed by the makefile

Expand All @@ -35,20 +35,51 @@ run it with "make example; ./example").

1) Lists of integers in random order.

const uint32_t b = maxbits(datain);// computes bit width
simdpackwithoutmask(datain, buffer, b);//compressed to buffer
simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
```C
const uint32_t b = maxbits(datain);// computes bit width
simdpackwithoutmask(datain, buffer, b);//compressed to buffer
simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
```
While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b.
2) Sorted lists of integers.
We use differential coding: we store the differences between successive integers. For this purpose, we need an initial value (called offset).

uint32_t offset = 0;
uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressed
simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
```C
uint32_t offset = 0;
uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressed
simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
```

General example for arrays of arbitrary length:
```C
/* Packs N integers, unpacks them again and checks the round trip.
 * Returns 0 on success, -1 on allocation failure or mismatch. */
int compress_decompress_demo() {
  size_t k, N = 9999;
  int status = 0;
  uint32_t * datain = malloc(N * sizeof(uint32_t));
  uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize);
  uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
  uint32_t b;

  if (datain == NULL || buffer == NULL || backbuffer == NULL) {
    /* free(NULL) is a no-op, so releasing all three is safe */
    free(datain);
    free(buffer);
    free(backbuffer);
    return -1;
  }

  for (k = 0; k < N; ++k){        /* start with k=0, not k=1! */
    datain[k] = (uint32_t)k;
  }

  b = maxbits_length(datain, N);
  simdpack_length(datain, N, (__m128i *)buffer, b);
  simdunpack_length((const __m128i *)buffer, N, backbuffer, b);

  for (k = 0; k < N; ++k){
    if(datain[k] != backbuffer[k]) {
      printf("bug\n");
      status = -1;
      break;
    }
  }

  /* release the buffers on both the success and the failure path */
  free(datain);
  free(buffer);
  free(backbuffer);
  return status;
}
```

Setup
---------
Expand All @@ -59,7 +90,7 @@ make test

and if you are daring:

make install
make install

Go
--------
Expand All @@ -83,5 +114,3 @@ References
* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015. http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916


197 changes: 143 additions & 54 deletions example.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,42 @@
#include <stdlib.h>
#include "simdcomp.h"

/**
We provide several different code examples.
**/

/* compresses data from datain to buffer, returns how many bytes written */

/**
 * Very simple round-trip test illustrating a simple application:
 * fills an array of N integers, packs it with simdpack_length, unpacks it
 * with simdunpack_length and compares the result with the input.
 *
 * Returns 0 when every decoded value matches, -1 when an allocation fails
 * or a mismatch is found. Every buffer allocated here is released before
 * returning, on both paths.
 **/
int compress_decompress_demo() {
    size_t k, N = 9999;
    int status = 0;
    uint32_t b;
    uint32_t * datain = malloc(N * sizeof(uint32_t));
    /* output buffer: uncompressed size plus N / SIMDBlockSize extra bytes */
    uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize);
    uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
    printf("== simple test\n");

    if (datain == NULL || buffer == NULL || backbuffer == NULL) {
        printf("memory allocation failed\n");
        status = -1;
        goto cleanup;
    }

    for (k = 0; k < N; ++k) { /* start with k=0, not k=1! */
        datain[k] = (uint32_t)k;
    }

    /* one bit width for the whole array, then pack and unpack all N values */
    b = maxbits_length(datain, N);
    simdpack_length(datain, N, (__m128i *)buffer, b);
    simdunpack_length((const __m128i *)buffer, N, backbuffer, b);

    for (k = 0; k < N; ++k) {
        if (datain[k] != backbuffer[k]) {
            printf("bug at %lu \n", (unsigned long)k);
            status = -1;
            goto cleanup;
        }
    }
    printf("Code works!\n");

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation */
    free(backbuffer);
    free(buffer);
    free(datain);
    return status;
}



/* compresses data from datain to buffer, returns how many bytes written
used below in simple_demo */
size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
uint32_t offset;
uint8_t * initout;
Expand All @@ -26,60 +60,115 @@ size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
return buffer - initout;
}

/* Another illustration ... */
void simple_demo() {
size_t REPEAT = 10, gap;
size_t N = 1000000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
uint32_t * datain = malloc(N * sizeof(uint32_t));
size_t compsize;
clock_t start, end;
uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
printf("== simple demo\n");
for (gap = 1; gap <= 243; gap *= 3) {
size_t k, repeat;
uint32_t offset = 0;
uint32_t bogus = 0;
double numberofseconds;

int main() {
int REPEAT = 10, gap;
size_t N = 1000000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
uint32_t * datain = malloc(N * sizeof(uint32_t));
size_t compsize;
clock_t start, end;
uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
for (gap = 1; gap <= 243; gap *= 3) {
int k, repeat;
uint32_t offset = 0;
uint32_t bogus = 0;
double numberofseconds;

printf("\n");
printf(" gap = %u \n", gap);
datain[0] = 0;
for (k = 1; k < N; ++k)
datain[k] = datain[k-1] + ( rand() % (gap + 1) );
compsize = compress(datain,N,buffer);
printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 ));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
uint8_t b = *decbuffer++;
simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
/* do something here with backbuffer */
bogus += backbuffer[3];
decbuffer += b * sizeof(__m128i);
offset = backbuffer[SIMDBlockSize - 1];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t));
bogus += backbuffer[3] - backbuffer[100];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
printf("ignore me %i \n",bogus);
printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
printf("\n");
printf(" gap = %lu \n", (unsigned long) gap);
datain[0] = 0;
for (k = 1; k < N; ++k)
datain[k] = datain[k-1] + ( rand() % (gap + 1) );
compsize = compress(datain,N,buffer);
printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 ));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
uint8_t b = *decbuffer++;
simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
/* do something here with backbuffer */
bogus += backbuffer[3];
decbuffer += b * sizeof(__m128i);
offset = backbuffer[SIMDBlockSize - 1];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t));
bogus += backbuffer[3] - backbuffer[100];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
printf("ignore me %i \n",bogus);
printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
}
free(buffer);
free(datain);
free(backbuffer);
}

/* Used below in more_sophisticated_demo ... */
size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) {
uint8_t * initout;
size_t k;
if(length/SIMDBlockSize*SIMDBlockSize != length) {
printf("Data length should be a multiple of %i \n",SIMDBlockSize);
}
free(buffer);
free(datain);
free(backbuffer);
return 0;
initout = buffer;
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = maxbits(datain);
*buffer++ = b;
simdpackwithoutmask(datain, (__m128i *)buffer, b);
datain += SIMDBlockSize;
buffer += b * sizeof(__m128i);
}
return buffer - initout;
}

/**
 * Here we compress the data in blocks of 128 integers with varying bit width.
 *
 * Encodes nn integers with varying_bit_width_compress, then decodes the
 * stream block by block: each block starts with one byte holding its bit
 * width b, followed by b 128-bit words that simdunpack expands into
 * SIMDBlockSize integers.
 *
 * Returns 0 when the decoded data matches the input, -1 on allocation
 * failure or mismatch. All buffers are released before returning.
 **/
int varying_bit_width_demo() {
    size_t nn = 128 * 2;
    uint32_t * datainn = malloc(nn * sizeof(uint32_t));
    uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
    uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
    const uint8_t * cursor; /* read position; buffern itself stays at the base so it can be freed */
    size_t k, compsize;
    int status = 0;
    printf("== varying bit-width demo\n");

    if (datainn == NULL || buffern == NULL || backbuffern == NULL) {
        printf("memory allocation failed\n");
        status = -1;
        goto cleanup;
    }

    for (k = 0; k < nn; ++k) {
        datainn[k] = (uint32_t)(rand() % (k + 1));
    }

    compsize = varying_bit_width_compress(datainn, nn, buffern);
    printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
           (unsigned)(nn * sizeof(uint32_t)));

    cursor = buffern;
    for (k = 0; k * SIMDBlockSize < nn; ++k) {
        uint32_t b = *cursor; /* per-block bit width stored ahead of the packed words */
        cursor++;
        simdunpack((const __m128i *)cursor, backbuffern + k * SIMDBlockSize, b);
        cursor += b * sizeof(__m128i);
    }

    for (k = 0; k < nn; ++k) {
        if (backbuffern[k] != datainn[k]) {
            printf("bug\n");
            status = -1;
            goto cleanup;
        }
    }
    printf("Code works!\n");

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation */
    free(backbuffern);
    free(buffern);
    free(datainn);
    return status;
}

/* Runs the self-checking demos first; the timing demo only runs if both pass. */
int main() {
    /* || short-circuits: the second demo is skipped when the first one fails */
    int failed = (compress_decompress_demo() != 0)
              || (varying_bit_width_demo() != 0);
    if (failed) {
        return -1;
    }
    simple_demo();
    return 0;
}
11 changes: 6 additions & 5 deletions include/portability.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#define SIMDBITCOMPAT_H_

#include <iso646.h> /* mostly for Microsoft compilers */
#include <stdint.h> /* part of Visual Studio 2010 and better */
#include <string.h>

#if SIMDCOMP_DEBUG
# define SIMDCOMP_ALWAYS_INLINE inline
Expand Down Expand Up @@ -45,12 +45,13 @@
# endif
#endif

/*
* typedefs really defeat the purpose of uint32_t and uint8_t. Modern C compilers
* should not need that:
#if defined(_MSC_VER) && _MSC_VER < 1600
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
*/
typedef signed char int8_t;
#else
#include <stdint.h> /* part of Visual Studio 2010 and better, others likely anyway */
#endif

#if defined(_MSC_VER)
#define SIMDCOMP_ALIGNED(x) __declspec(align(x))
Expand Down
28 changes: 28 additions & 0 deletions include/simdbitpacking.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
/* for memset */
#include <string.h>

#include "simdcomputil.h"


/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" */
void simdpack(const uint32_t * in,__m128i * out, const uint32_t bit);

Expand All @@ -20,5 +23,30 @@ void simdpackwithoutmask(const uint32_t * in,__m128i * out, const uint32_t bit
/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */
void simdunpack(const __m128i * in,uint32_t * out, const uint32_t bit);

/* like simdpack, but supports an undetermined number of inputs.
* This is useful if you need to pack an array of integers whose length is not a multiple of 128.
* Returns a pointer to the (advanced) compressed array. */
__m128i * simdpack_length(const uint32_t * in, size_t length, __m128i * out, const uint32_t bit);

/* like simdunpack, but supports an undetermined number of inputs.
* This is useful if you need to unpack an array of integers whose length is not a multiple of 128.
* Returns a pointer to the (advanced) compressed array. */
const __m128i * simdunpack_length(const __m128i * in, size_t length, uint32_t * out, const uint32_t bit);




/* like simdpack, but supports an undetermined small number of inputs. This is useful if you need to pack fewer than 128 integers.
* Note that this function is much slower.
* Returns a pointer to the (advanced) compressed array. */
__m128i * simdpack_shortlength(const uint32_t * in, int length, __m128i * out, const uint32_t bit);

/* like simdunpack, but supports an undetermined small number of inputs. This is useful if you need to unpack fewer than 128 integers.
* Note that this function is much slower.
* Returns a pointer to the (advanced) compressed array. */
const __m128i * simdunpack_shortlength(const __m128i * in, int length, uint32_t * out, const uint32_t bit);

/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
void simdfastset(__m128i * in128, uint32_t b, uint32_t value, size_t index);

#endif /* SIMDBITPACKING_H_ */
1 change: 1 addition & 0 deletions include/simdcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ extern "C" {

#include "simdbitpacking.h"
#include "simdcomputil.h"
#include "simdfor.h"
#include "simdintegratedbitpacking.h"

#ifdef __cplusplus
Expand Down
Loading

0 comments on commit dac115f

Please sign in to comment.