祭りだそうで
->無駄に並列化してみる
->TBBで遊んでみたよ(12)
->並列化?ハナアルキしてやんよ!
-
-
-
-
- >いまここ
-
-
-
今のご時世CPUで並列化とか...
ってことで書いたんだけどレジスタが11も使われてて悲しい・・・
あと1つ減らせたら理論値で実行効率を1.5倍にできるのにorz
#include <cuda.h>
#include <valarray>
#include <iostream>

// Counts how many zero digits appear in the decimal representations of the
// integers 0 .. range_max (inclusive), in parallel on the GPU.
//
// Launch layout: 1-D grid, blockDim.x == 256 (power of two, matches
// __launch_bounds__), dynamic shared memory of blockDim.x * sizeof(int) bytes.
// Each block tree-reduces its per-thread counts in shared memory down to
// warpSize partial sums and writes them to
// dest[blockIdx.x * warpSize + 0 .. warpSize-1]; the host sums the partials.
//
// NOTE: the original version returned early both on the out-of-range guard and
// inside the reduction loop, so some threads skipped __syncthreads() that the
// rest of the block waited on (undefined behavior), and it had no barrier
// between writing and first reading shared memory. Both are fixed here: every
// thread executes every barrier, and only the participating half does the add.
template < unsigned int range_max >
__global__ void __launch_bounds__( 256 )
count_kernel( int * __restrict__ dest )
{
    const int num = blockDim.x * blockIdx.x + threadIdx.x;
    extern __shared__ int _cnt[];

    // Per-thread zero-digit count; out-of-range threads contribute 0 but
    // still participate in every barrier below.
    int cnt = 0;
    if ( num <= static_cast< int >( range_max ) ) {
        int n = num;
        do {
            cnt += !( n % 10 );   // count one zero per zero decimal digit
        } while ( n /= 10 );      // num == 0 contributes exactly 1, as before
    }
    _cnt[ threadIdx.x ] = cnt;
    __syncthreads();   // publish all counts before any cross-thread read

    // Shared-memory tree reduction down to warpSize partial sums.
    // All threads reach every __syncthreads(); only the low half adds.
    for ( int calc_threads = blockDim.x >> 1; warpSize <= calc_threads; calc_threads >>= 1 ) {
        if ( threadIdx.x < calc_threads ) {
            _cnt[ threadIdx.x ] += _cnt[ threadIdx.x + calc_threads ];
        }
        __syncthreads();
    }

    // The first warp now holds this block's warpSize partial sums.
    if ( threadIdx.x < warpSize ) {
        dest[ blockIdx.x * warpSize + threadIdx.x ] = _cnt[ threadIdx.x ];
    }
}

namespace {
    const int range_max = 1000;   // count zero digits over 0 .. 1000
    const int blockSize = 256;    // must match __launch_bounds__; power of two
    const int gridSize  = ( range_max + blockSize - 1 ) / blockSize;  // ceil-div
    const int warp_size = 32;     // partial sums written per block by the kernel
}

int main( void )
{
    // The kernel writes gridSize * warp_size partial sums (ints).
    const int result_count = gridSize * warp_size;
    const int buffer_bytes = static_cast< int >( sizeof( int ) ) * result_count;

    int *dbuffer = NULL;
    if ( cudaMalloc( &dbuffer, buffer_bytes ) != cudaSuccess ) {
        std::cerr << "cudaMalloc failed" << std::endl;
        return 1;
    }

    count_kernel< range_max >
        <<< gridSize, blockSize, blockSize * sizeof( int ) >>>( dbuffer );

    // A kernel launch does not return an error directly: pick up launch
    // errors with cudaGetLastError(), execution errors at the sync.
    cudaError_t err = cudaGetLastError();
    if ( err == cudaSuccess ) {
        err = cudaDeviceSynchronize();   // cudaThreadSynchronize() is deprecated
    }
    if ( err != cudaSuccess ) {
        std::cerr << "kernel failed: " << cudaGetErrorString( err ) << std::endl;
        cudaFree( dbuffer );
        return 1;
    }

    // Allocate result_count ELEMENTS (the original allocated buffer_bytes
    // elements and summed 3/4 of the valarray as uninitialized garbage).
    int *hbuffer = new int [ result_count ];
    err = cudaMemcpy( hbuffer, dbuffer, buffer_bytes, cudaMemcpyDeviceToHost );
    cudaFree( dbuffer );
    if ( err != cudaSuccess ) {
        std::cerr << "cudaMemcpy failed: " << cudaGetErrorString( err ) << std::endl;
        delete [] hbuffer;
        return 1;
    }

    // valarray's (ptr, n) constructor takes an element count, not bytes.
    std::valarray< int > vala( hbuffer, result_count );
    std::cout << vala.sum() << std::endl;

    delete [] hbuffer;   // array form: new[] must pair with delete[]
    return 0;
}
んーやっぱGPUで総和をやらせるのはむずいな