shfl_scan.cu (forked from NVIDIA/cuda-samples)
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Shuffle Intrinsics CUDA Sample
// This sample demonstrates the use of the warp shuffle intrinsics.
// First, a simple example performs a prefix sum (scan) using the
// shuffle-up intrinsic.
// Second, a more involved example computes an integral image, using
// both the shuffle scan operation and shuffle xor operations.
#include <stdio.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include "shfl_integral_image.cuh"
// Scan using shfl - takes log2(n) steps.
// This function demonstrates basic use of the shuffle intrinsic,
// __shfl_up_sync, to perform a scan operation across a block.
// First, it performs a scan (a prefix sum in this case) inside each warp.
// Then, to continue the scan across the block, each warp's sum is placed
// into shared memory and a single warp performs a shuffle scan on that
// shared memory. The results are then uniformly added to each warp's threads.
// This pyramid-type approach continues by placing each block's final sum
// in global memory, prefix summing that via another kernel call, and then
// uniformly adding across the input data via the uniform_add<<<>>> kernel.
__global__ void shfl_scan_test(int *data, int width, int *partial_sums = NULL) {
extern __shared__ int sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
int lane_id = id % warpSize;
  // determine the warp_id within the block
  int warp_id = threadIdx.x / warpSize;
  // Below is the basic structure of using a shfl instruction
  // for a scan.
  // Record "value" in a variable - we accumulate into it along the way.
  int value = data[id];
  // Now accumulate in log2 steps up the chain:
  // add in the value of the thread that is a distance i (the shuffle delta)
  // below this one. Threads for which that source lane would fall outside
  // the warp are left unaffected. This produces the scan sum.
#pragma unroll
for (int i = 1; i <= width; i *= 2) {
unsigned int mask = 0xffffffff;
int n = __shfl_up_sync(mask, value, i, width);
if (lane_id >= i) value += n;
}
  // value now holds the inclusive scan result for this thread
  // next, combine the per-warp totals:
  // each warp's last lane writes its warp sum to shared memory
if (threadIdx.x % warpSize == warpSize - 1) {
sums[warp_id] = value;
}
__syncthreads();
//
// scan sum the warp sums
// the same shfl scan operation, but performed on warp sums
//
if (warp_id == 0 && lane_id < (blockDim.x / warpSize)) {
int warp_sum = sums[lane_id];
int mask = (1 << (blockDim.x / warpSize)) - 1;
for (int i = 1; i <= (blockDim.x / warpSize); i *= 2) {
int n = __shfl_up_sync(mask, warp_sum, i, (blockDim.x / warpSize));
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
// perform a uniform add across warps in the block
  // read the preceding warp's scanned sum and add it to this thread's value
int blockSum = 0;
if (warp_id > 0) {
blockSum = sums[warp_id - 1];
}
value += blockSum;
// Now write out our result
data[id] = value;
  // the last thread holds the block's total; write it out as the block's sum
if (partial_sums != NULL && threadIdx.x == blockDim.x - 1) {
partial_sums[blockIdx.x] = value;
}
}
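// The warp-level step used in the kernel above can also be sketched in
// isolation. This is a minimal illustration, not part of the original sample;
// the helper name warp_inclusive_scan is ours, and it assumes a full
// 32-thread warp is active.
__device__ int warp_inclusive_scan(int value) {
  const unsigned int mask = 0xffffffff;  // all 32 lanes participate
  int lane_id = threadIdx.x % warpSize;
  for (int delta = 1; delta < warpSize; delta *= 2) {
    // fetch the value held by the lane 'delta' positions below this one
    int n = __shfl_up_sync(mask, value, delta);
    // lanes with no source lane 'delta' below them keep their value
    if (lane_id >= delta) value += n;
  }
  return value;  // lane i now holds the sum of values from lanes 0..i
}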
// Uniform add: each block adds its corresponding partial sum to all of its
// elements
__global__ void uniform_add(int *data, int *partial_sums, int len) {
__shared__ int buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
  if (id >= len) return;  // bounds guard (>= so the last valid index is len - 1)
if (threadIdx.x == 0) {
buf = partial_sums[blockIdx.x];
}
__syncthreads();
data[id] += buf;
}
static unsigned int iDivUp(unsigned int dividend, unsigned int divisor) {
return ((dividend % divisor) == 0) ? (dividend / divisor)
: (dividend / divisor + 1);
}
// This function verifies the shuffle scan result, for the simple
// prefix sum case.
bool CPUverify(int *h_data, int *h_result, int n_elements) {
// cpu verify
for (int i = 0; i < n_elements - 1; i++) {
h_data[i + 1] = h_data[i] + h_data[i + 1];
}
int diff = 0;
for (int i = 0; i < n_elements; i++) {
    diff += abs(h_data[i] - h_result[i]);  // abs so opposite-signed errors cannot cancel
}
printf("CPU verify result diff (GPUvsCPU) = %d\n", diff);
bool bTestResult = false;
if (diff == 0) bTestResult = true;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
for (int j = 0; j < 100; j++)
for (int i = 0; i < n_elements - 1; i++) {
h_data[i + 1] = h_data[i] + h_data[i + 1];
}
sdkStopTimer(&hTimer);
double cput = sdkGetTimerValue(&hTimer);
printf("CPU sum (naive) took %f ms\n", cput / 100);
return bTestResult;
}
// this verifies the row scan result for synthetic data of all 1's
unsigned int verifyDataRowSums(unsigned int *h_image, int w, int h) {
unsigned int diff = 0;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
int gold = i + 1;
diff +=
abs(static_cast<int>(gold) - static_cast<int>(h_image[j * w + i]));
}
}
return diff;
}
bool shuffle_simple_test(int argc, char **argv) {
int *h_data, *h_partial_sums, *h_result;
int *d_data, *d_partial_sums;
const int n_elements = 65536;
int sz = sizeof(int) * n_elements;
int cuda_device = 0;
printf("Starting shfl_scan\n");
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// __shfl intrinsic needs SM 3.0 or higher
if (deviceProp.major < 3) {
printf("> __shfl() intrinsic requires device SM 3.0+\n");
printf("> Waiving test.\n");
exit(EXIT_WAIVED);
}
checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_data),
sizeof(int) * n_elements));
checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_result),
sizeof(int) * n_elements));
// initialize data:
printf("Computing Simple Sum test\n");
printf("---------------------------------------------------\n");
printf("Initialize test data [1, 1, 1...]\n");
for (int i = 0; i < n_elements; i++) {
h_data[i] = 1;
}
int blockSize = 256;
int gridSize = n_elements / blockSize;
int nWarps = blockSize / 32;
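  // dynamic shared memory for shfl_scan_test: one int per warp in the block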
int shmem_sz = nWarps * sizeof(int);
int n_partialSums = n_elements / blockSize;
int partial_sz = n_partialSums * sizeof(int);
printf("Scan summation for %d elements, %d partial sums\n", n_elements,
n_elements / blockSize);
int p_blockSize = min(n_partialSums, blockSize);
int p_gridSize = iDivUp(n_partialSums, p_blockSize);
printf("Partial summing %d elements with %d blocks of size %d\n",
n_partialSums, p_gridSize, p_blockSize);
// initialize a timer
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
float et = 0;
float inc = 0;
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_partial_sums), partial_sz));
checkCudaErrors(cudaMemset(d_partial_sums, 0, partial_sz));
checkCudaErrors(
cudaMallocHost(reinterpret_cast<void **>(&h_partial_sums), partial_sz));
checkCudaErrors(cudaMemcpy(d_data, h_data, sz, cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(start, 0));
shfl_scan_test<<<gridSize, blockSize, shmem_sz>>>(d_data, 32, d_partial_sums);
shfl_scan_test<<<p_gridSize, p_blockSize, shmem_sz>>>(d_partial_sums, 32);
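  // d_partial_sums now holds the inclusive scan of the per-block sums, so
  // partial_sums[i] is exactly the offset needed by block i + 1 of the first
  // pass. Block 0 needs no correction, hence gridSize - 1 blocks operating on
  // the data shifted by one block (d_data + blockSize).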
uniform_add<<<gridSize - 1, blockSize>>>(d_data + blockSize, d_partial_sums,
n_elements);
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&inc, start, stop));
et += inc;
checkCudaErrors(cudaMemcpy(h_result, d_data, sz, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_partial_sums, d_partial_sums, partial_sz,
cudaMemcpyDeviceToHost));
printf("Test Sum: %d\n", h_partial_sums[n_partialSums - 1]);
printf("Time (ms): %f\n", et);
printf("%d elements scanned in %f ms -> %f MegaElements/s\n", n_elements, et,
n_elements / (et / 1000.0f) / 1000000.0f);
bool bTestResult = CPUverify(h_data, h_result, n_elements);
checkCudaErrors(cudaFreeHost(h_data));
checkCudaErrors(cudaFreeHost(h_result));
checkCudaErrors(cudaFreeHost(h_partial_sums));
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_partial_sums));
return bTestResult;
}
// This function tests creation of an integral image using synthetic greyscale
// data of size 1920x1080 pixels (a CPU reference sketch of the same two-pass
// computation follows the function).
bool shuffle_integral_image_test() {
char *d_data;
unsigned int *h_image;
unsigned int *d_integral_image;
int w = 1920;
int h = 1080;
int n_elements = w * h;
int sz = sizeof(unsigned int) * n_elements;
printf("\nComputing Integral Image Test on size %d x %d synthetic data\n", w,
h);
printf("---------------------------------------------------\n");
checkCudaErrors(cudaMallocHost(reinterpret_cast<void **>(&h_image), sz));
  // clear the host result buffer (the synthetic all-1's input is created on
  // the device with cudaMemset below)
  memset(h_image, 0, sz);
  // each thread handles 16 values, so one block covers a row
  int blockSize = iDivUp(w, 16);
  // launch one block per row
  int gridSize = h;
// Create a synthetic image for testing
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_data), sz));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_integral_image),
n_elements * sizeof(int) * 4));
checkCudaErrors(cudaMemset(d_data, 1, sz));
checkCudaErrors(cudaMemset(d_integral_image, 0, sz));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float et = 0;
unsigned int err;
// Execute scan line prefix sum kernel, and time it
cudaEventRecord(start);
shfl_intimage_rows<<<gridSize, blockSize>>>(
reinterpret_cast<uint4 *>(d_data),
reinterpret_cast<uint4 *>(d_integral_image));
cudaEventRecord(stop);
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
printf("Method: Fast Time (GPU Timer): %f ms ", et);
// verify the scan line results
checkCudaErrors(
cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
err = verifyDataRowSums(h_image, w, h);
printf("Diff = %d\n", err);
// Execute column prefix sum kernel and time it
dim3 blockSz(32, 8);
dim3 testGrid(w / blockSz.x, 1);
cudaEventRecord(start);
shfl_vertical_shfl<<<testGrid, blockSz>>>((unsigned int *)d_integral_image, w,
h);
cudaEventRecord(stop);
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
printf("Method: Vertical Scan Time (GPU Timer): %f ms ", et);
// Verify the column results
checkCudaErrors(
cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
printf("\n");
int finalSum = h_image[w * h - 1];
printf("CheckSum: %d, (expect %dx%d=%d)\n", finalSum, w, h, w * h);
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_integral_image));
checkCudaErrors(cudaFreeHost(h_image));
// verify final sum: if the final value in the corner is the same as the size
// of the buffer (all 1's) then the integral image was generated successfully
  return finalSum == w * h;
}
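// For reference, a minimal CPU sketch of the same two-pass integral image:
// row-wise inclusive scans followed by column-wise inclusive scans, matching
// the shfl_intimage_rows / shfl_vertical_shfl kernels used above. The helper
// name cpu_integral_image is ours, not part of the original sample, and it is
// not called by the test.
static void cpu_integral_image(const unsigned char *in, unsigned int *out,
                               int w, int h) {
  // pass 1: inclusive prefix sum along each row
  for (int y = 0; y < h; y++) {
    for (int x = 0; x < w; x++) {
      out[y * w + x] = in[y * w + x] + (x > 0 ? out[y * w + x - 1] : 0u);
    }
  }
  // pass 2: inclusive prefix sum down each column
  for (int x = 0; x < w; x++) {
    for (int y = 1; y < h; y++) {
      out[y * w + x] += out[(y - 1) * w + x];
    }
  }
}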
int main(int argc, char *argv[]) {
// Initialization. The shuffle intrinsic is not available on SM < 3.0
// so waive the test if the hardware is not present.
int cuda_device = 0;
printf("Starting shfl_scan\n");
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// __shfl intrinsic needs SM 3.0 or higher
if (deviceProp.major < 3) {
printf("> __shfl() intrinsic requires device SM 3.0+\n");
printf("> Waiving test.\n");
exit(EXIT_WAIVED);
}
bool bTestResult = true;
bool simpleTest = shuffle_simple_test(argc, argv);
bool intTest = shuffle_integral_image_test();
  bTestResult = simpleTest && intTest;
exit((bTestResult) ? EXIT_SUCCESS : EXIT_FAILURE);
}