/* ----------------------------------------------------------------------------- * * Kernel : Stencil2 * Copyright : [2008..2011] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell * License : BSD3 * * Maintainer : Trevor L. McDonell * Stability : experimental * * Apply the function to each element of the array that takes a neighborhood of * elements from two input arrays as its input. Each thread processes multiple * elements, striding the array by the grid size. To improve performance, both * input arrays are bound as texture references so that reads are cached. * * ---------------------------------------------------------------------------*/ #include #if !defined(BOUNDARY_CLAMP_1) && !defined(BOUNDARY_MIRROR_1) && !defined(BOUNDARY_WRAP_1) /* * Bounds check handling for Constant. */ static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM1 sh, const DIM1 ix) { if (ix < 0) return boundary_const1(); else if (ix >= sh) return boundary_const1(); else return tex_get1(ix); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM2 sh, const DIM2 ix) { if (ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM3 sh, const DIM3 ix) { if (ix.a2 < 0 || ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a2 >= sh.a2 || ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM4 sh, const DIM4 ix) { if (ix.a3 < 0 || ix.a2 < 0 || ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a3 >= sh.a3 || ix.a2 >= sh.a2 || ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM5 sh, const DIM5 ix) { if (ix.a4 < 0 || ix.a3 < 0 || ix.a2 < 0 || ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a4 >= sh.a4 || ix.a3 >= sh.a3 || ix.a2 >= sh.a2 || ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM6 sh, const DIM6 ix) { if (ix.a5 < 0 || ix.a4 < 0 || ix.a3 < 0 || ix.a2 < 0 || ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a5 >= sh.a5 || ix.a4 >= sh.a4 || ix.a3 >= sh.a3 || ix.a2 >= sh.a2 || ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM7 sh, const DIM7 ix) { if (ix.a6 < 0 || ix.a5 < 0 || ix.a4 < 0 || ix.a3 < 0 || ix.a2 < 0 || ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a6 >= sh.a6 || ix.a5 >= sh.a5 || ix.a4 >= sh.a4 || ix.a3 >= sh.a3 || ix.a2 >= sh.a2 || ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM8 sh, const DIM8 ix) { if (ix.a7 < 0 || ix.a6 < 0 || ix.a5 < 0 || ix.a4 < 0 || ix.a3 < 0 || ix.a2 < 0 || ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a7 >= sh.a7 || ix.a6 >= sh.a6 || ix.a5 >= sh.a5 || ix.a4 >= sh.a4 || ix.a3 >= sh.a3 || ix.a2 >= sh.a2 || ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } static __inline__ __device__ TyIn1 get1_for_constant_bounds(const DIM9 sh, const DIM9 ix) { if (ix.a8 < 0 || ix.a7 < 0 || ix.a6 < 0 || ix.a5 < 0 || ix.a4 < 0 || ix.a3 < 0 || ix.a2 < 0 || ix.a1 < 0 || ix.a0 < 0) return boundary_const1(); else if (ix.a8 >= sh.a8 || ix.a7 >= sh.a7 || ix.a6 >= sh.a6 || ix.a5 >= sh.a5 || ix.a4 >= sh.a4 || ix.a3 >= sh.a3 || ix.a2 >= sh.a2 || ix.a1 >= sh.a1 || ix.a0 >= sh.a0) return boundary_const1(); else return tex_get1(toIndex(sh, ix)); } #endif /* * Getter function that handles indexing outside the array boundary (array 1). */ static inline __attribute__((device)) TyIn1 get1_for_stencil(DimIn1 sh, DimIn1 ix) { #if defined(BOUNDARY_CLAMP_1) || defined(BOUNDARY_MIRROR_1) || defined(BOUNDARY_WRAP_1) return tex_get1(toIndex(sh, project_for_bounds(sh, ix))); #else return get1_for_constant_bounds(sh, ix); #endif } /* * The kernel. */ extern "C" __global__ void stencil2 ( ArrOut d_out, const DimOut d_out_shape, const DimIn0 d_in0_shape, const DimIn0 d_in1_shape ) { Ix idx; const Ix gridSize = __umul24(blockDim.x, gridDim.x); for (idx = __umul24(blockDim.x, blockIdx.x) + threadIdx.x; idx < size(d_out_shape); idx += gridSize) { set(d_out, idx, gather_and_apply(d_in0_shape, fromIndex(d_in0_shape, idx), d_in1_shape, fromIndex(d_in1_shape, idx))); } }