/*
 * filter -- one pass of a periodic three-point smoothing stencil.
 *
 * For every index i in [0, n):
 *     tmp[i] = 0.25 * (u[i-1] + 2*u[i] + u[i+1])
 * where the neighbour indices wrap around (index -1 means n-1, index n means 0).
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. Any grid
 * that supplies at least n threads is valid; surplus threads exit early.
 *
 * u   : input array of n floats (read only by this kernel)
 * tmp : output array of n floats; presumably must not alias u, since
 *       neighbouring threads read u while others write tmp -- TODO confirm
 *       with callers
 * n   : number of elements
 */
__global__ void filter(float* u, float* tmp, int n)
{
    // Form the global index in 64 bits so blockIdx.x * blockDim.x cannot wrap
    // before it is compared against n.
    const long long gid =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Tail guard: the grid is rounded up to whole blocks, so the last block
    // may contain threads past the end of the data. Also covers n <= 0.
    if (gid >= n) {
        return;
    }

    const int i = static_cast<int>(gid);

    // Periodic neighbours. Equivalent to (i-1+n)%n and (i+1)%n for i in
    // [0, n), but avoids integer modulo and cannot overflow for large n.
    const int i0 = (i == 0) ? n - 1 : i - 1;
    const int i1 = (i == n - 1) ? 0 : i + 1;

    tmp[i] = 0.25f * (u[i0] + 2.0f * u[i] + u[i1]);
}

/*
 * gpu_run_cuda -- launch filter over n elements and wait for it to finish.
 *
 * u, tmp : passed straight through to the kernel; assumed to be
 *          device-accessible pointers to at least n floats each -- TODO
 *          confirm with callers
 * n      : number of elements; nothing is launched when n <= 0
 *
 * The threads-per-block count comes from the externally defined global
 * `bsize`. A zero bsize is treated as "nothing to do" rather than dividing
 * by zero.
 *
 * This function returns void, so CUDA errors are not reported here; a failed
 * launch or synchronize should still be observable by the caller through
 * cudaGetLastError().
 */
void gpu_run_cuda(float* u, float* tmp, int n)
{
    extern unsigned long bsize;

    if (n <= 0 || bsize == 0) {
        return;
    }

    // Ceiling division: enough blocks to cover all n elements even when n is
    // not a multiple of bsize (the kernel guards the overhanging threads).
    const unsigned long nblocks =
        (static_cast<unsigned long>(n) + bsize - 1) / bsize;

    dim3 bsz(static_cast<unsigned int>(bsize), 1, 1);
    dim3 gsz(static_cast<unsigned int>(nblocks), 1, 1);

    filter<<<gsz, bsz>>>(u, tmp, n);

    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
    // direct replacement and blocks until the kernel has completed.
    cudaDeviceSynchronize();
}