diff --git a/examples/quiz1/quiz1/quiz1.c b/examples/quiz1/quiz1/quiz1.c
index 5861f1ce..4c6f010e 100644
--- a/examples/quiz1/quiz1/quiz1.c
+++ b/examples/quiz1/quiz1/quiz1.c
@@ -9,11 +9,11 @@
 // inp : f32[N] @DRAM,
 // out : f32[N] @DRAM
 // )
-void vec_double( void *ctxt, int_fast32_t N, const float* inp, float* out ) {
-EXO_ASSUME(N % 8 == 0);
-for (int_fast32_t i = 0; i < N; i++) {
-  out[i] = 2.0f * inp[i];
-}
+void vec_double(void *ctxt, int_fast32_t N, const float *inp, float *out) {
+  EXO_ASSUME(N % 8 == 0);
+  for (int_fast32_t i = 0; i < N; i++) {
+    out[i] = 2.0f * inp[i];
+  }
 }
 
 // vec_double_optimized(
@@ -21,20 +21,20 @@ for (int_fast32_t i = 0; i < N; i++) {
 // inp : f32[N] @DRAM,
 // out : f32[N] @DRAM
 // )
-void vec_double_optimized( void *ctxt, int_fast32_t N, const float* inp, float* out ) {
-EXO_ASSUME(N % 8 == 0);
-__m256 two_vec;
-two_vec = _mm256_broadcast_ss(2.0);
-for (int_fast32_t io = 0; io < ((N) / (8)); io++) {
-  __m256 out_vec;
-  __m256 inp_vec;
-  inp_vec = _mm256_loadu_ps(&inp[8 * io]);
-  out_vec = _mm256_mul_ps(two_vec, inp_vec);
-  _mm256_storeu_ps(&out[8 * io], out_vec);
-}
+void vec_double_optimized(
+    void *ctxt, int_fast32_t N, const float *inp, float *out) {
+  EXO_ASSUME(N % 8 == 0);
+  __m256 two_vec;
+  two_vec = _mm256_broadcast_ss(2.0);
+  for (int_fast32_t io = 0; io < ((N) / (8)); io++) {
+    __m256 out_vec;
+    __m256 inp_vec;
+    inp_vec = _mm256_loadu_ps(&inp[8 * io]);
+    out_vec = _mm256_mul_ps(two_vec, inp_vec);
+    _mm256_storeu_ps(&out[8 * io], out_vec);
+  }
 }
-
 
 /* relying on the following instruction..."
 vector_assign_two(out)
 {out_data} = _mm256_broadcast_ss(2.0);
diff --git a/examples/quiz1/quiz1/quiz1.h b/examples/quiz1/quiz1/quiz1.h
index 534debbf..bb91cfcd 100644
--- a/examples/quiz1/quiz1/quiz1.h
+++ b/examples/quiz1/quiz1/quiz1.h
@@ -7,41 +7,38 @@
 extern "C" {
 #endif
 
-
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 
 // Compiler feature macros adapted from Hedley (public domain)
 // https://github.com/nemequ/hedley
 
 #if defined(__has_builtin)
-# define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin)
+#define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin)
 #else
-# define EXO_HAS_BUILTIN(builtin) (0)
+#define EXO_HAS_BUILTIN(builtin) (0)
 #endif
 
 #if EXO_HAS_BUILTIN(__builtin_assume)
-# define EXO_ASSUME(expr) __builtin_assume(expr)
+#define EXO_ASSUME(expr) __builtin_assume(expr)
 #elif EXO_HAS_BUILTIN(__builtin_unreachable)
-# define EXO_ASSUME(expr) \
-  ((void)((expr) ? 1 : (__builtin_unreachable(), 1)))
+#define EXO_ASSUME(expr) ((void)((expr) ? 1 : (__builtin_unreachable(), 1)))
 #else
-# define EXO_ASSUME(expr) ((void)(expr))
+#define EXO_ASSUME(expr) ((void)(expr))
 #endif
 
-
 #ifndef EXO_WIN_1F32
 #define EXO_WIN_1F32
-struct exo_win_1f32{
-    float * const data;
-    const int_fast32_t strides[1];
+struct exo_win_1f32 {
+  float *const data;
+  const int_fast32_t strides[1];
 };
 #endif
 #ifndef EXO_WIN_1F32C
 #define EXO_WIN_1F32C
-struct exo_win_1f32c{
-    const float * const data;
-    const int_fast32_t strides[1];
+struct exo_win_1f32c {
+  const float *const data;
+  const int_fast32_t strides[1];
 };
 #endif
 // vec_double(
@@ -49,16 +46,15 @@ struct exo_win_1f32c{
 // inp : f32[N] @DRAM,
 // out : f32[N] @DRAM
 // )
-void vec_double( void *ctxt, int_fast32_t N, const float* inp, float* out );
+void vec_double(void *ctxt, int_fast32_t N, const float *inp, float *out);
 
 // vec_double_optimized(
 // N : size,
 // inp : f32[N] @DRAM,
 // out : f32[N] @DRAM
 // )
-void vec_double_optimized( void *ctxt, int_fast32_t N, const float* inp, float* out );
-
-
+void vec_double_optimized(
+    void *ctxt, int_fast32_t N, const float *inp, float *out);
 
 #ifdef __cplusplus
 }
diff --git a/examples/quiz2/quiz2/quiz2.c b/examples/quiz2/quiz2/quiz2.c
index abc4efdd..7059c10b 100644
--- a/examples/quiz2/quiz2/quiz2.c
+++ b/examples/quiz2/quiz2/quiz2.c
@@ -9,11 +9,12 @@
 // b : f32[N] @DRAM,
 // c : f32[N] @DRAM
 // )
-void scaled_add( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c ) {
-EXO_ASSUME(N % 8 == 0);
-for (int_fast32_t i = 0; i < N; i++) {
-  c[i] = 2.0f * a[i] + 3.0f * b[i];
-}
+void scaled_add(
+    void *ctxt, int_fast32_t N, const float *a, const float *b, float *c) {
+  EXO_ASSUME(N % 8 == 0);
+  for (int_fast32_t i = 0; i < N; i++) {
+    c[i] = 2.0f * a[i] + 3.0f * b[i];
+  }
 }
 
 // scaled_add_scheduled(
@@ -22,42 +23,42 @@ for (int_fast32_t i = 0; i < N; i++) {
 // b : f32[N] @DRAM,
 // c : f32[N] @DRAM
 // )
-void scaled_add_scheduled( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c ) {
-EXO_ASSUME(N % 8 == 0);
-for (int_fast32_t io = 0; io < ((N) / (8)); io++) {
-  float *vec = (float*) malloc(8 * sizeof(*vec));
-  float *vec_1 = (float*) malloc(8 * sizeof(*vec_1));
-  float *vec_2 = (float*) malloc(8 * sizeof(*vec_2));
-  float *vec_3 = (float*) malloc(8 * sizeof(*vec_3));
-  float *vec_4 = (float*) malloc(8 * sizeof(*vec_4));
-  float *vec_5 = (float*) malloc(8 * sizeof(*vec_5));
-  for (int_fast32_t ii = 0; ii < 8; ii++) {
-    vec_1[ii] = 2.0f;
-  }
-  for (int_fast32_t ii = 0; ii < 8; ii++) {
-    vec_2[ii] = a[8 * io + ii];
-  }
-  for (int_fast32_t ii = 0; ii < 8; ii++) {
-    vec[ii] = vec_1[ii] * vec_2[ii];
-  }
-  free(vec_2);
-  free(vec_1);
-  for (int_fast32_t ii = 0; ii < 8; ii++) {
-    vec_4[ii] = 3.0f;
+void scaled_add_scheduled(
+    void *ctxt, int_fast32_t N, const float *a, const float *b, float *c) {
+  EXO_ASSUME(N % 8 == 0);
+  for (int_fast32_t io = 0; io < ((N) / (8)); io++) {
+    float *vec = (float *)malloc(8 * sizeof(*vec));
+    float *vec_1 = (float *)malloc(8 * sizeof(*vec_1));
+    float *vec_2 = (float *)malloc(8 * sizeof(*vec_2));
+    float *vec_3 = (float *)malloc(8 * sizeof(*vec_3));
+    float *vec_4 = (float *)malloc(8 * sizeof(*vec_4));
+    float *vec_5 = (float *)malloc(8 * sizeof(*vec_5));
+    for (int_fast32_t ii = 0; ii < 8; ii++) {
+      vec_1[ii] = 2.0f;
+    }
+    for (int_fast32_t ii = 0; ii < 8; ii++) {
+      vec_2[ii] = a[8 * io + ii];
+    }
+    for (int_fast32_t ii = 0; ii < 8; ii++) {
+      vec[ii] = vec_1[ii] * vec_2[ii];
+    }
+    free(vec_2);
+    free(vec_1);
+    for (int_fast32_t ii = 0; ii < 8; ii++) {
+      vec_4[ii] = 3.0f;
+    }
+    for (int_fast32_t ii = 0; ii < 8; ii++) {
+      vec_5[ii] = b[8 * io + ii];
+    }
+    for (int_fast32_t ii = 0; ii < 8; ii++) {
+      vec_3[ii] = vec_4[ii] * vec_5[ii];
+    }
+    free(vec_5);
+    free(vec_4);
+    for (int_fast32_t ii = 0; ii < 8; ii++) {
+      c[8 * io + ii] = vec[ii] + vec_3[ii];
+    }
+    free(vec_3);
+    free(vec);
+  }
 }
-  for (int_fast32_t ii = 0; ii < 8; ii++) {
-    vec_5[ii] = b[8 * io + ii];
-  }
-  for (int_fast32_t ii = 0; ii < 8; ii++) {
-    vec_3[ii] = vec_4[ii] * vec_5[ii];
-  }
-  free(vec_5);
-  free(vec_4);
-  for (int_fast32_t ii = 0; ii < 8; ii++) {
-    c[8 * io + ii] = vec[ii] + vec_3[ii];
-  }
-  free(vec_3);
-  free(vec);
 }
-}
-
diff --git a/examples/quiz2/quiz2/quiz2.h b/examples/quiz2/quiz2/quiz2.h
index e7c89722..27a55b25 100644
--- a/examples/quiz2/quiz2/quiz2.h
+++ b/examples/quiz2/quiz2/quiz2.h
@@ -7,37 +7,34 @@
 extern "C" {
 #endif
 
-
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 
 // Compiler feature macros adapted from Hedley (public domain)
 // https://github.com/nemequ/hedley
 
 #if defined(__has_builtin)
-# define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin)
+#define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin)
 #else
-# define EXO_HAS_BUILTIN(builtin) (0)
+#define EXO_HAS_BUILTIN(builtin) (0)
 #endif
 
 #if EXO_HAS_BUILTIN(__builtin_assume)
-# define EXO_ASSUME(expr) __builtin_assume(expr)
+#define EXO_ASSUME(expr) __builtin_assume(expr)
 #elif EXO_HAS_BUILTIN(__builtin_unreachable)
-# define EXO_ASSUME(expr) \
-  ((void)((expr) ? 1 : (__builtin_unreachable(), 1)))
+#define EXO_ASSUME(expr) ((void)((expr) ? 1 : (__builtin_unreachable(), 1)))
 #else
-# define EXO_ASSUME(expr) ((void)(expr))
+#define EXO_ASSUME(expr) ((void)(expr))
 #endif
 
-
-
 // scaled_add(
 // N : size,
 // a : f32[N] @DRAM,
 // b : f32[N] @DRAM,
 // c : f32[N] @DRAM
 // )
-void scaled_add( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c );
+void scaled_add(
+    void *ctxt, int_fast32_t N, const float *a, const float *b, float *c);
 
 // scaled_add_scheduled(
 // N : size,
@@ -45,9 +42,8 @@ void scaled_add( void *ctxt, int_fast32_t N, const float* a, const float* b, flo
 // b : f32[N] @DRAM,
 // c : f32[N] @DRAM
 // )
-void scaled_add_scheduled( void *ctxt, int_fast32_t N, const float* a, const float* b, float* c );
-
-
+void scaled_add_scheduled(
+    void *ctxt, int_fast32_t N, const float *a, const float *b, float *c);
 
 #ifdef __cplusplus
 }
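
A quick way to confirm that the reformatting above is behavior-preserving is to build the quiz2 sources and run the two kernels against each other. The driver below is an illustrative sketch, not part of the patch; it assumes the generated entry points tolerate NULL for the unused ctxt parameter (as Exo's example harnesses do) and that N is a multiple of 8, per EXO_ASSUME(N % 8 == 0).

// check_quiz2.c -- hypothetical harness, not included in this diff.
// Compares the plain and scheduled quiz2 kernels on a small input.
#include <stdio.h>

#include "quiz2.h"

int main(void) {
  enum { N = 16 }; // must be a multiple of 8 per EXO_ASSUME
  float a[N], b[N], c_ref[N], c_sched[N];
  for (int i = 0; i < N; i++) {
    a[i] = (float)i;
    b[i] = (float)(2 * i);
  }

  // ctxt is unused by the generated code; NULL is assumed to be acceptable.
  scaled_add(NULL, N, a, b, c_ref);
  scaled_add_scheduled(NULL, N, a, b, c_sched);

  // Both variants compute 2.0f*a[i] + 3.0f*b[i] with the same float
  // operations, so exact equality is expected.
  for (int i = 0; i < N; i++) {
    if (c_ref[i] != c_sched[i]) {
      fprintf(stderr, "mismatch at i=%d: %f vs %f\n", i, c_ref[i], c_sched[i]);
      return 1;
    }
  }
  printf("ok\n");
  return 0;
}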