In one of my projects, I'm seeing incorrect results when using CUB's `DeviceReduce::ReduceByKey`. However, using the same inputs/outputs with `thrust::reduce_by_key` produces the expected results.
// Minimal reproduction: sum a vector of 1.0f values by key with
// cub::DeviceReduce::ReduceByKey and print "<run index>, <aggregate>" per run.
//
// The keys form three consecutive runs (4000 x key 1, 1000 x key 2,
// 2680 x key 3), so the expected output is:
//   0, 4000
//   1, 1000
//   2, 2680
#include "cub/cub.cuh"

#include <stdint.h>  // uint64_t; the C header also works when nvcc runs the host compiler in C++98 mode
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda.h>

// Binary reduction operator handed to ReduceByKey: plain float addition.
struct AddFunctor {
    __host__ __device__ __forceinline__
    float operator()(const float& a, const float& b) const
    {
        return a + b;
    }
} reduction_op;

// Aborts with a readable message if a CUDA/CUB call did not return cudaSuccess.
// Without this, a failed allocation, copy or launch would silently print garbage.
static void check(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

int main()
{
    int n = 7680;

    // Host keys: three consecutive runs with keys 1, 2 and 3.
    std::vector<uint64_t> keys_h(n);
    for (int i = 0; i < 4000; i++) keys_h[i] = 1;
    for (int i = 4000; i < 5000; i++) keys_h[i] = 2;
    for (int i = 5000; i < 7680; i++) keys_h[i] = 3;

    uint64_t* keys;
    check(cudaMalloc(&keys, sizeof(uint64_t) * n), "cudaMalloc(keys)");
    check(cudaMemcpy(keys, &keys_h[0], sizeof(uint64_t) * n, cudaMemcpyDefault),
          "cudaMemcpy(keys)");

    // Output buffers are sized for the worst case of n distinct runs.
    uint64_t* unique_keys;
    check(cudaMalloc(&unique_keys, sizeof(uint64_t) * n), "cudaMalloc(unique_keys)");

    // Every value is 1.0, so each aggregate equals the length of its run.
    std::vector<float> values_h(n);
    for (int i = 0; i < n; i++) values_h[i] = 1.0;

    float* values;
    check(cudaMalloc(&values, sizeof(float) * n), "cudaMalloc(values)");
    check(cudaMemcpy(values, &values_h[0], sizeof(float) * n, cudaMemcpyDefault),
          "cudaMemcpy(values)");

    float* aggregates;
    check(cudaMalloc(&aggregates, sizeof(float) * n), "cudaMalloc(aggregates)");

    // Device-side counter that receives the number of runs found.
    int* remaining;
    check(cudaMalloc(&remaining, sizeof(int)), "cudaMalloc(remaining)");

    // First call with a NULL buffer only reports the required temp-storage size.
    // NULL (not nullptr) keeps the file buildable with a plain `nvcc file.cu`.
    size_t size = 0;
    void* buffer = NULL;
    check(cub::DeviceReduce::ReduceByKey(
              buffer, size,
              keys, unique_keys,
              values, aggregates,
              remaining,
              reduction_op,
              n),
          "ReduceByKey (size query)");

    check(cudaMalloc(&buffer, sizeof(char) * size), "cudaMalloc(buffer)");

    // Second call performs the actual reduction.
    check(cub::DeviceReduce::ReduceByKey(
              buffer, size,
              keys, unique_keys,
              values, aggregates,
              remaining,
              reduction_op,
              n),
          "ReduceByKey");

    int remaining_h;
    check(cudaMemcpy(&remaining_h, remaining, sizeof(int), cudaMemcpyDefault),
          "cudaMemcpy(remaining)");

    std::vector<float> aggregates_h(remaining_h);
    check(cudaMemcpy(&aggregates_h[0], aggregates, sizeof(float) * remaining_h,
                     cudaMemcpyDefault),
          "cudaMemcpy(aggregates)");

    for (int i = 0; i < remaining_h; i++) {
        std::cout << i << ", " << aggregates_h[i] << std::endl;
    }

    cudaFree(buffer);
    cudaFree(keys);
    cudaFree(unique_keys);
    cudaFree(values);
    cudaFree(aggregates);
    cudaFree(remaining);
}
When I include `-gencode arch=compute_35,code=sm_35` (for a Kepler GTX Titan), it produces the wrong results, but when I leave these flags out entirely, it works.
$ nvcc cub_test.cu
$ ./a.out
0, 4000
1, 1000
2, 2680
$ nvcc cub_test.cu -gencode arch=compute_35,code=sm_35
$ ./a.out
0, 4000
1, 1000
2, 768
I use a handful of other CUB calls without issue; this is the only one misbehaving. I've also tried running this code on a GTX 1080 Ti (with compute_61, sm_61), and I see the same behavior.
Is the right solution to omit these compiler flags?
I tried on one machine with:
- cuda 8.0
- ubuntu 16.04
- gcc 5.4.0
- cub 1.6.4
- kepler gtx titan (compute capability 3.5)
and with:
- cuda 8.0
- ubuntu 16.04
- gcc 5.4.0
- cub 1.6.4
- pascal gtx 1080 ti (compute capability 6.1)
It sounds like you should file a bug report at the CUB repository's issues page.
Edit: I can reproduce the issue:
[joeuser@myhost:/tmp]$ nvcc -I/opt/cub -o a a.cu
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
[joeuser@myhost:/tmp]$ ./a
0, 4000
1, 1000
2, 2680
[joeuser@myhost:/tmp]$ nvcc -I/opt/cub -o a a.cu -gencode arch=compute_30,code=sm_30
[joeuser@myhost:/tmp]$ ./a
0, 4000
1, 1000
2, 512
relevant info:
- cuda: 8.0.61
- nvidia driver: 375.39
- distribution: gnu/linux mint 18.1
- linux kernel: 4.4.0
- gcc: 5.4.0-6ubuntu1~16.04.4
- cub: 1.6.4
- gpu: gtx 650 ti (compute capability 3.0)
Comments
Post a Comment