TensorReductionGpu.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
12 
13 namespace Eigen {
14 namespace internal {
15 
16 
17 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
18 // Full reducers for GPU, don't vectorize for now
19 
20 // Reducer function that enables multiple GPU threads to safely accumulate to the same
21 // output address. It reads the current value of the output variable and attempts to
22 // update it with the new reduced value. If another GPU thread updated the contents of
23 // the output address in the meantime, it tries again.
24 template <typename T, typename R>
25 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
26 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
27  if (sizeof(T) == 4)
28  {
29  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
30  unsigned int newval = oldval;
31  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
32  if (newval == oldval) {
33  return;
34  }
35  unsigned int readback;
36  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
37  oldval = readback;
38  newval = oldval;
39  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
40  if (newval == oldval) {
41  return;
42  }
43  }
44  }
45  else if (sizeof(T) == 8) {
46  unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
47  unsigned long long newval = oldval;
48  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49  if (newval == oldval) {
50  return;
51  }
52  unsigned long long readback;
53  while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
54  oldval = readback;
55  newval = oldval;
56  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57  if (newval == oldval) {
58  return;
59  }
60  }
61  }
62  else {
63  gpu_assert(0 && "Wordsize not supported");
64  }
65 #else // EIGEN_CUDA_ARCH >= 300
66  gpu_assert(0 && "Shouldn't be called on unsupported device");
67 #endif // EIGEN_CUDA_ARCH >= 300
68 }
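// For illustration, here is a host-side sketch of the same read/modify/compare-and-swap retry
// pattern written against std::atomic. Everything below (atomicReduceHost and its assumptions)
// is hypothetical and not part of Eigen; it only assumes a reducer exposing reduce(T, T*) like
// the reducers used throughout this file.
#include <atomic>

template <typename Reducer>
void atomicReduceHost(std::atomic<float>* output, float accum, Reducer& reducer) {
  float oldval = output->load();
  float newval;
  do {
    newval = oldval;
    reducer.reduce(accum, &newval);  // fold accum into the value we last observed
    if (newval == oldval) return;    // the stored value would not change, skip the CAS
  } while (!output->compare_exchange_weak(oldval, newval));  // on failure oldval is refreshed, so retry
}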
69 
70 // We extend atomicExch to support extra data types
71 template <typename Type>
72 __device__ inline Type atomicExchCustom(Type* address, Type val) {
73  return atomicExch(address, val);
74 }
75 
76 template <>
77 __device__ inline double atomicExchCustom(double* address, double val) {
78  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
79  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
80 }
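// For illustration, the specialization above exchanges the raw 64-bit payload of the double
// because atomicExch has no double overload. A host-side sketch of the same bit-reinterpretation
// idea, using std::memcpy in place of __double_as_longlong / __longlong_as_double (hypothetical
// helpers, not part of Eigen, and of course not atomic):
#include <cstring>

inline unsigned long long double_bits(double d) {
  unsigned long long u;
  std::memcpy(&u, &d, sizeof(u));  // copy the 64-bit payload unchanged
  return u;
}

inline double bits_to_double(unsigned long long u) {
  double d;
  std::memcpy(&d, &u, sizeof(d));
  return d;
}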
81 
82 #ifdef EIGEN_HAS_GPU_FP16
83 template <typename R>
84 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
85  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
86  unsigned int newval = oldval;
87  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
88  if (newval == oldval) {
89  return;
90  }
91  unsigned int readback;
92  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
93  oldval = readback;
94  newval = oldval;
95  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
96  if (newval == oldval) {
97  return;
98  }
99  }
100 }
101 // The reduction must be associative, because the update is not atomic over the whole wide vector: only the individual half2 operations are atomic.
102 template <typename R>
103 __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
104  half2* houtput = reinterpret_cast<half2*>(output);
105  half2* haccum = reinterpret_cast<half2*>(&accum);
106  for (int i = 0; i < 4; ++i) {
107  atomicReduce(houtput + i, *(haccum + i), reducer);
108  }
109 }
110 #endif // EIGEN_HAS_GPU_FP16
111 
112 template <>
113 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
114 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
115  atomicAdd(output, accum);
116 #else // EIGEN_CUDA_ARCH >= 300
117  gpu_assert(0 && "Shouldn't be called on unsupported device");
118 #endif // EIGEN_CUDA_ARCH >= 300
119 }
120 
121 
122 template <typename CoeffType, typename Index>
123 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
124  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
125  const Index num_threads = blockDim.x * gridDim.x;
126  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
127  output[i] = val;
128  }
129 }
130 
131 
132 template <int BlockSize, int NumPerThread, typename Self,
133  typename Reducer, typename Index>
134 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
135  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
136 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
137  // Initialize the output value
138  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
139  if (gridDim.x == 1) {
140  if (first_index == 0) {
141  *output = reducer.initialize();
142  }
143  }
144  else {
145  if (threadIdx.x == 0) {
146  unsigned int block = atomicCAS(semaphore, 0u, 1u);
147  if (block == 0) {
148  // We're the first block to run, initialize the output value
149  atomicExchCustom(output, reducer.initialize());
150  __threadfence();
151  atomicExch(semaphore, 2u);
152  }
153  else {
154  // Wait for the first block to initialize the output value.
155  // Use atomicCAS here to ensure that the reads aren't cached
156  unsigned int val;
157  do {
158  val = atomicCAS(semaphore, 2u, 2u);
159  }
160  while (val < 2u);
161  }
162  }
163  }
164 
165  __syncthreads();
166 
167  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
168 
169  typename Self::CoeffReturnType accum = reducer.initialize();
170  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
171  for (Index i = 0; i < max_iter; i+=BlockSize) {
172  const Index index = first_index + i;
173  eigen_assert(index < num_coeffs);
174  typename Self::CoeffReturnType val = input.m_impl.coeff(index);
175  reducer.reduce(val, &accum);
176  }
177 
178 #pragma unroll
179  for (int offset = warpSize/2; offset > 0; offset /= 2) {
180  #if defined(EIGEN_HIPCC)
181  // use std::is_floating_point to determine the type of accum
182  // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
183  // and list the float and int versions of __shfl_down as the candidate functions.
184  if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
185  reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
186  } else {
187  reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
188  }
189  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
190  reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
191  #else
192  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
193  #endif
194  }
195 
196  if ((threadIdx.x & (warpSize - 1)) == 0) {
197  atomicReduce(output, accum, reducer);
198  }
199 
200  if (gridDim.x > 1 && threadIdx.x == 0) {
201  // Let the last block reset the semaphore
202  atomicInc(semaphore, gridDim.x + 1);
203 #if defined(EIGEN_HIPCC)
204  __threadfence_system();
205 #endif
206  }
207 #else // EIGEN_CUDA_ARCH >= 300
208  gpu_assert(0 && "Shouldn't be called on unsupported device");
209 #endif // EIGEN_CUDA_ARCH >= 300
210 }
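// For illustration, the warp-level tree reduction used in the kernel above, specialized to a plain
// float sum so the shuffle pattern is easier to see. warp_sum is a hypothetical helper, not part of
// Eigen; it assumes CUDA 9+ (__shfl_down_sync) and a full 32-lane warp.
__device__ float warp_sum(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Add the value held by the lane `offset` positions above; after log2(warpSize) steps,
    // lane 0 holds the sum of the whole warp.
    val += __shfl_down_sync(0xFFFFFFFF, val, offset);
  }
  return val;  // only meaningful on lane 0
}
// A kernel built on this would then do, as above:
//   if ((threadIdx.x & (warpSize - 1)) == 0) atomicAdd(output, warp_sum(val));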
211 
212 
213 #ifdef EIGEN_HAS_GPU_FP16
214 template <typename Self,
215  typename Reducer, typename Index>
216 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
217  packet_traits<Eigen::half>::type* scratch) {
218  eigen_assert(blockDim.x == 1);
219  eigen_assert(gridDim.x == 1);
220  typedef packet_traits<Eigen::half>::type packet_type;
221  Index packet_remainder =
222  num_coeffs % Index(unpacket_traits<packet_type>::size);
223  if (packet_remainder != 0) {
224  half2* h2scratch = reinterpret_cast<half2*>(scratch);
225  for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
226  *h2scratch =
227  __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
228  h2scratch++;
229  }
230  if ((num_coeffs & 1) != 0) {
231  half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
232  *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
233  }
234  } else {
235  *scratch = reducer.template initializePacket<packet_type>();
236  }
237 }
238 
239 template <typename Self,
240  typename Reducer, typename Index>
241 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
242  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
243  const Index num_threads = blockDim.x * gridDim.x;
244  typedef typename packet_traits<Eigen::half>::type PacketType;
245 
246  const Index num_packets =
247  num_coeffs / Index(unpacket_traits<PacketType>::size);
248  PacketType* p_output = reinterpret_cast<PacketType*>(output);
249  for (Index i = thread_id; i < num_packets; i += num_threads) {
250  p_output[i] = reducer.template initializePacket<PacketType>();
251  }
252  Index packet_remainder =
253  num_coeffs % Index(unpacket_traits<PacketType>::size);
254  if (thread_id < packet_remainder) {
255  output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
256  }
257 }
258 
259 template <int BlockSize, int NumPerThread, typename Self,
260  typename Reducer, typename Index>
261 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
262  half* output, packet_traits<Eigen::half>::type* scratch) {
263  typedef typename packet_traits<Eigen::half>::type PacketType;
264  const int packet_width = unpacket_traits<PacketType>::size;
265  eigen_assert(NumPerThread % packet_width == 0);
266  const Index first_index =
267  blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
268 
269  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
270 
271  if (gridDim.x == 1) {
272  if (first_index == 0) {
273  int rem = num_coeffs % packet_width;
274  if (rem != 0) {
275  half2* p_scratch = reinterpret_cast<half2*>(scratch);
276  *scratch = reducer.template initializePacket<PacketType>();
277  for (int i = 0; i < rem / 2; i++) {
278  *p_scratch = __halves2half2(
279  input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
280  input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
281  p_scratch++;
282  }
283  if ((num_coeffs & 1) != 0) {
284  half last = input.m_impl.coeff(num_coeffs - 1);
285  *p_scratch = __halves2half2(last, reducer.initialize());
286  }
287  } else {
288  *scratch = reducer.template initializePacket<PacketType>();
289  }
290  }
291  __syncthreads();
292  }
293 
294  PacketType accum = reducer.template initializePacket<PacketType>();
295  const Index max_iter =
296  numext::mini<Index>((num_coeffs - first_index) / packet_width,
297  NumPerThread * BlockSize / packet_width);
298  for (Index i = 0; i < max_iter; i += BlockSize) {
299  const Index index = first_index + packet_width * i;
300  eigen_assert(index + packet_width < num_coeffs);
301  PacketType val = input.m_impl.template packet<Unaligned>(index);
302  reducer.reducePacket(val, &accum);
303  }
304 
305 #pragma unroll
306  for (int offset = warpSize/2; offset > 0; offset /= 2) {
307  #if defined(EIGEN_HIPCC)
308  PacketType r1;
309  half2* hr = reinterpret_cast<half2*>(&r1);
310  half2* hacc = reinterpret_cast<half2*>(&accum);
311  for (int i = 0; i < packet_width / 2; i++) {
312  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
313  union { int i; half2 h; } wka_in, wka_out;
314  wka_in.h = hacc[i];
315  wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
316  hr[i] = wka_out.h;
317  }
318  reducer.reducePacket(r1, &accum);
319  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
320  PacketType r1;
321  half2* hr = reinterpret_cast<half2*>(&r1);
322  half2* hacc = reinterpret_cast<half2*>(&accum);
323  for (int i = 0; i < packet_width / 2; i++) {
324  hr[i] = __shfl_down(hacc[i], offset, warpSize);
325  }
326  reducer.reducePacket(r1, &accum);
327  #else
328  PacketType r1;
329  half2* hr = reinterpret_cast<half2*>(&r1);
330  half2* hacc = reinterpret_cast<half2*>(&accum);
331  for (int i = 0; i < packet_width / 2; i++) {
332  hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
333  }
334  reducer.reducePacket(r1, &accum);
335 
336  #endif
337  }
338 
339  if ((threadIdx.x & (warpSize - 1)) == 0) {
340  atomicReduce(scratch, accum, reducer);
341  }
342 
343  __syncthreads();
344  half2* rv1 = reinterpret_cast<half2*>(scratch);
345  if (packet_width > 2) {
346  reducer.reducePacket(rv1[2], rv1);
347  reducer.reducePacket(rv1[3], rv1 + 1);
348  reducer.reducePacket(rv1[1], rv1);
349  }
350  if (gridDim.x == 1) {
351  if (first_index == 0) {
352  half tmp = __low2half(*rv1);
353  reducer.reduce(__high2half(*rv1), &tmp);
354  *output = tmp;
355  }
356  }
357 }
358 
359 template <typename Op>
360 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
361  eigen_assert(threadIdx.x == 0);
362  half2* pscratch = reinterpret_cast<half2*>(scratch);
363  half tmp = __float2half(0.f);
364  typedef packet_traits<Eigen::half>::type packet_type;
365  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
366  reducer.reduce(__low2half(*pscratch), &tmp);
367  reducer.reduce(__high2half(*pscratch), &tmp);
368  pscratch++;
369  }
370  *output = tmp;
371 }
372 
373 #endif // EIGEN_HAS_GPU_FP16
374 
375 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
376 struct FullReductionLauncher {
377  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
378  gpu_assert(false && "Should only be called on doubles, floats and half floats");
379  }
380 };
381 
382 // Specialization for float and double
383 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
384 struct FullReductionLauncher<
385  Self, Op, OutputType, PacketAccess,
386  typename internal::enable_if<
387  internal::is_same<float, OutputType>::value ||
388  internal::is_same<double, OutputType>::value,
389  void>::type> {
390  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
391 
392  typedef typename Self::Index Index;
393  const int block_size = 256;
394  const int num_per_thread = 128;
395  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
396 
397  unsigned int* semaphore = NULL;
398  if (num_blocks > 1) {
399  semaphore = device.semaphore();
400  }
401 
402  LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
403  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
404  }
405 };
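// Worked example of the launch arithmetic above, with illustrative numbers only: each block
// covers block_size * num_per_thread = 256 * 128 = 32768 coefficients, so reducing one million
// coefficients launches divup(1000000, 32768) = 31 blocks, and because num_blocks > 1 the
// semaphore protocol in FullReductionKernel is used.
constexpr int kBlockSize = 256;
constexpr int kNumPerThread = 128;
constexpr long long kNumCoeffs = 1000000;
constexpr long long kCoeffsPerBlock = kBlockSize * kNumPerThread;                       // 32768
constexpr long long kNumBlocks = (kNumCoeffs + kCoeffsPerBlock - 1) / kCoeffsPerBlock;  // 31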
406 
407 #ifdef EIGEN_HAS_GPU_FP16
408 template <typename Self, typename Op>
409 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
410  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
411  gpu_assert(false && "Should not be called since there is no packet accessor");
412  }
413 };
414 
415 template <typename Self, typename Op>
416 struct FullReductionLauncher<Self, Op, Eigen::half, true> {
417  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
418  typedef typename Self::Index Index;
419  typedef typename packet_traits<Eigen::half>::type PacketType;
420 
421  const int block_size = 256;
422  const int num_per_thread = 128;
423  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
424  PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
425  // half2* scratch = static_cast<half2*>(device.scratchpad());
426 
427  if (num_blocks > 1) {
428  // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
429  // won't be race conditions between multiple thread blocks.
430  LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
431  1, 1, 0, device, reducer, self, num_coeffs, scratch);
432  }
433 
434  LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
435  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
436 
437  if (num_blocks > 1) {
438  LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
439  1, 1, 0, device, reducer, output, scratch);
440  }
441  }
442 };
443 #endif // EIGEN_HAS_GPU_FP16
444 
445 
446 template <typename Self, typename Op, bool Vectorizable>
447 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
448  // Unfortunately NVIDIA GPUs do not handle exotic types such as complex well,
449  // so we reduce the scope of the optimized version of the code to the simple cases
450  // of doubles, floats and half floats.
451 #ifdef EIGEN_HAS_GPU_FP16
452  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
453  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
454  internal::is_same<typename Self::CoeffReturnType, double>::value ||
455  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
456 #else // EIGEN_HAS_GPU_FP16
457  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
458  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
459  internal::is_same<typename Self::CoeffReturnType, double>::value);
460 #endif // EIGEN_HAS_GPU_FP16
461 
462  template <typename OutputType>
463  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
464  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
465  const Index num_coeffs = array_prod(self.m_impl.dimensions());
466  // Don't crash when we're called with an input tensor of size 0.
467  if (num_coeffs == 0) {
468  return;
469  }
470 
471  FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
472  }
473 };
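// For illustration, a rough usage sketch of how this full-reduction path is reached from the
// Tensor API (assumes a CUDA or HIP build compiled with EIGEN_USE_GPU and the GpuStreamDevice /
// GpuDevice setup used by Eigen's GPU tests; error handling is omitted, and the function name
// and sizes are made up for the example).
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

void full_reduction_example() {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int n = 72, m = 97;
  float* d_in  = static_cast<float*>(gpu_device.allocate(n * m * sizeof(float)));
  float* d_out = static_cast<float*>(gpu_device.allocate(sizeof(float)));
  // ... fill d_in with input data ...

  Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, n, m);
  Eigen::TensorMap<Eigen::Tensor<float, 0>> out(d_out);
  out.device(gpu_device) = in.sum();  // full reduction, dispatched to FullReducer<..., GpuDevice>

  gpu_device.synchronize();
  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_out);
}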
474 
475 
476 template <int NumPerThread, typename Self,
477  typename Reducer, typename Index>
478 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
479  typename Self::CoeffReturnType* output) {
480 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
481  typedef typename Self::CoeffReturnType Type;
482  eigen_assert(blockDim.y == 1);
483  eigen_assert(blockDim.z == 1);
484  eigen_assert(gridDim.y == 1);
485  eigen_assert(gridDim.z == 1);
486 
487  const int unroll_times = 16;
488  eigen_assert(NumPerThread % unroll_times == 0);
489 
490  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
491  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
492 
493  const Index num_threads = blockDim.x * gridDim.x;
494  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
495 
496  // Initialize the output values if they weren't initialized by the ReductionInitKernel
497  if (gridDim.x == 1) {
498  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
499  output[i] = reducer.initialize();
500  }
501  __syncthreads();
502  }
503 
504  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
505  const Index row = i / input_col_blocks;
506 
507  if (row < num_preserved_coeffs) {
508  const Index col_block = i % input_col_blocks;
509  const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
510 
511  Type reduced_val = reducer.initialize();
512 
513  for (Index j = 0; j < NumPerThread; j += unroll_times) {
514  const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
515  if (last_col >= num_coeffs_to_reduce) {
516  for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
517  const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
518  reducer.reduce(val, &reduced_val);
519  }
520  break;
521  } else {
522  // Faster version of the loop with no branches after unrolling.
523 #pragma unroll
524  for (int k = 0; k < unroll_times; ++k) {
525  const Index col = col_begin + blockDim.x * (j + k);
526  reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
527  }
528  }
529  }
530 
531 #pragma unroll
532  for (int offset = warpSize/2; offset > 0; offset /= 2) {
533  #if defined(EIGEN_HIPCC)
534  // use std::is_floating_point to determine the type of reduced_val
535  // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
536  // and list the float and int versions of __shfl_down as the candidate functions.
537  if (std::is_floating_point<Type>::value) {
538  reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
539  } else {
540  reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
541  }
542  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
543  reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
544  #else
545  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
546  #endif
547  }
548 
549  if ((threadIdx.x & (warpSize - 1)) == 0) {
550  atomicReduce(&(output[row]), reduced_val, reducer);
551  }
552  }
553  }
554 #else // EIGEN_CUDA_ARCH >= 300
555  gpu_assert(0 && "Shouldn't be called on unsupported device");
556 #endif // EIGEN_CUDA_ARCH >= 300
557 }
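// For illustration, a scalar CPU reference for what InnerReductionKernel computes (a hypothetical
// helper, not part of Eigen). It uses the same flat indexing as the kernel above: the coefficients
// being reduced are contiguous, and output[row] receives the reduction of one such run.
template <typename T, typename Reducer>
void inner_reduction_reference(const T* input, T* output, long num_preserved_coeffs,
                               long num_coeffs_to_reduce, Reducer reducer) {
  for (long row = 0; row < num_preserved_coeffs; ++row) {
    T reduced_val = reducer.initialize();
    for (long col = 0; col < num_coeffs_to_reduce; ++col) {
      reducer.reduce(input[row * num_coeffs_to_reduce + col], &reduced_val);
    }
    output[row] = reduced_val;
  }
}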
558 
559 #ifdef EIGEN_HAS_GPU_FP16
560 
561 template <int NumPerThread, typename Self,
562  typename Reducer, typename Index>
563 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
564  half* output) {
565  eigen_assert(blockDim.y == 1);
566  eigen_assert(blockDim.z == 1);
567  eigen_assert(gridDim.y == 1);
568  eigen_assert(gridDim.z == 1);
569 
570  typedef typename packet_traits<Eigen::half>::type PacketType;
571  const int packet_width = unpacket_traits<PacketType>::size;
572  const int unroll_times = 16 / packet_width;
573  eigen_assert(NumPerThread % unroll_times == 0);
574  eigen_assert(unroll_times % 2 == 0);
575 
576  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
577  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
578 
579  const Index num_threads = blockDim.x * gridDim.x;
580  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
581 
582  // Initialize the output values if they weren't initialized by the ReductionInitKernel
583  if (gridDim.x == 1) {
584  Index i = packet_width * thread_id;
585  for (; i + packet_width <= num_preserved_coeffs;
586  i += packet_width * num_threads) {
587  PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
588  *poutput = reducer.template initializePacket<PacketType>();
589  }
590  if (i < num_preserved_coeffs) {
591  output[i] = reducer.initialize();
592  }
593  __syncthreads();
594  }
595 
596  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
597  const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
598 
599  if (row + 1 < num_preserved_coeffs) {
600  const Index col_block = i % input_col_blocks;
601  const Index col_begin =
602  packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
603 
604  PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
605  PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
606 
607  for (Index j = 0; j < NumPerThread; j += unroll_times) {
608  const Index last_col =
609  col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
610  if (last_col >= num_coeffs_to_reduce) {
611  Index col = col_begin + blockDim.x * j;
612  for (; col + packet_width <= num_coeffs_to_reduce;
613  col += blockDim.x) {
614  const PacketType val1 = input.m_impl.template packet<Unaligned>(
615  row * num_coeffs_to_reduce + col);
616  reducer.reducePacket(val1, &reduced_val1);
617  const PacketType val2 = input.m_impl.template packet<Unaligned>(
618  (row + 1) * num_coeffs_to_reduce + col);
619  reducer.reducePacket(val2, &reduced_val2);
620  }
621  if (col < num_coeffs_to_reduce) {
622  PacketType r1 = reducer.template initializePacket<PacketType>();
623  PacketType r2 = reducer.template initializePacket<PacketType>();
624  half2* hr1 = reinterpret_cast<half2*>(&r1);
625  half2* hr2 = reinterpret_cast<half2*>(&r2);
626  while (col + 1 < num_coeffs_to_reduce) {
627  *hr1 = __halves2half2(
628  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
629  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
630  *hr2 = __halves2half2(
631  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
632  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
633  1));
634  hr1++;
635  hr2++;
636  col += 2;
637  }
638  if (col < num_coeffs_to_reduce) {
639  // Peel;
640  const half last1 =
641  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
642  *hr1 = __halves2half2(last1, reducer.initialize());
643  const half last2 =
644  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
645  *hr2 = __halves2half2(last2, reducer.initialize());
646  }
647  reducer.reducePacket(r1, &reduced_val1);
648  reducer.reducePacket(r2, &reduced_val2);
649  }
650  break;
651  } else {
652  // Faster version of the loop with no branches after unrolling.
653 #pragma unroll
654  for (int k = 0; k < unroll_times; ++k) {
655  const Index col = col_begin + blockDim.x * (j + k) * packet_width;
656  reducer.reducePacket(input.m_impl.template packet<Unaligned>(
657  row * num_coeffs_to_reduce + col),
658  &reduced_val1);
659  reducer.reducePacket(input.m_impl.template packet<Unaligned>(
660  (row + 1) * num_coeffs_to_reduce + col),
661  &reduced_val2);
662  }
663  }
664  }
665 
666 #pragma unroll
667  for (int offset = warpSize/2; offset > 0; offset /= 2) {
668  #if defined(EIGEN_HIPCC)
669  PacketType r1;
670  PacketType r2;
671  half2* hr1 = reinterpret_cast<half2*>(&r1);
672  half2* hr2 = reinterpret_cast<half2*>(&r2);
673  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
674  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
675  for (int i = 0; i < packet_width / 2; i++) {
676  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
677  union { int i; half2 h; } wka_in1, wka_out1;
678  wka_in1.h = rv1[i];
679  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
680  hr1[i] = wka_out1.h;
681 
682  union { int i; half2 h; } wka_in2, wka_out2;
683  wka_in2.h = rv2[i];
684  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
685  hr2[i] = wka_out2.h;
686  }
687  reducer.reducePacket(r1, &reduced_val1);
688  reducer.reducePacket(r2, &reduced_val2);
689  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
690  PacketType r1;
691  PacketType r2;
692  half2* hr1 = reinterpret_cast<half2*>(&r1);
693  half2* hr2 = reinterpret_cast<half2*>(&r2);
694  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
695  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
696  for (int i = 0; i < packet_width / 2; i++) {
697  hr1[i] = __shfl_down(rv1[i], offset, warpSize);
698  hr2[i] = __shfl_down(rv2[i], offset, warpSize);
699  }
700  reducer.reducePacket(r1, &reduced_val1);
701  reducer.reducePacket(r2, &reduced_val2);
702  #else
703  PacketType r1;
704  PacketType r2;
705  half2* hr1 = reinterpret_cast<half2*>(&r1);
706  half2* hr2 = reinterpret_cast<half2*>(&r2);
707  half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
708  half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
709  for (int i = 0; i < packet_width / 2; i++) {
710  hr1[i] =
711  __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
712  hr2[i] =
713  __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
714  }
715  reducer.reducePacket(r1, &reduced_val1);
716  reducer.reducePacket(r2, &reduced_val2);
717 
718  #endif
719  }
720  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
721  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
722  half2 val;
723  if (packet_width > 2) {
724  reducer.reducePacket(rv1[2], rv1);
725  reducer.reducePacket(rv1[3], rv1 + 1);
726  reducer.reducePacket(rv1[1], rv1);
727  reducer.reducePacket(rv2[2], rv2);
728  reducer.reducePacket(rv2[3], rv2 + 1);
729  reducer.reducePacket(rv2[1], rv2);
730  }
731  half val1 = __low2half(*rv1);
732  reducer.reduce(__high2half(*rv1), &val1);
733  half val2 = __low2half(*rv2);
734  reducer.reduce(__high2half(*rv2), &val2);
735  val = __halves2half2(val1, val2);
736  if ((threadIdx.x & (warpSize - 1)) == 0) {
737  half* loc = output + row;
738  atomicReduce((half2*)loc, val, reducer);
739  }
740  }
741  }
742 }
743 
744 #endif // EIGEN_HAS_GPU_FP16
745 
746 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
747 struct InnerReductionLauncher {
748  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
749  gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
750  return true;
751  }
752 };
753 
754 // Specialization for float and double
755 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
756 struct InnerReductionLauncher<
757  Self, Op, OutputType, PacketAccess,
758  typename internal::enable_if<
759  internal::is_same<float, OutputType>::value ||
760  internal::is_same<double, OutputType>::value,
761  void>::type> {
762  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
763  typedef typename Self::Index Index;
764 
765  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
766  const int block_size = 256;
767  const int num_per_thread = 128;
768  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
769  const int max_blocks = device.getNumGpuMultiProcessors() *
770  device.maxGpuThreadsPerMultiProcessor() / block_size;
771  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
772 
773  if (num_blocks > 1) {
774  // We initialize the outputs outside the reduction kernel when we can't be sure that there
775  // won't be race conditions between multiple thread blocks.
776  const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
777  const int max_blocks = device.getNumGpuMultiProcessors() *
778  device.maxGpuThreadsPerMultiProcessor() / 1024;
779  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
780  LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
781  num_blocks, 1024, 0, device, reducer.initialize(),
782  num_preserved_vals, output);
783  }
784 
785  LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
786  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
787 
788  return false;
789  }
790 };
791 
792 #ifdef EIGEN_HAS_GPU_FP16
793 template <typename Self, typename Op>
794 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
795  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
796  gpu_assert(false && "Should not be called since there is no packet accessor");
797  return true;
798  }
799 };
800 
801 template <typename Self, typename Op>
802 struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
803  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
804  typedef typename Self::Index Index;
805 
806  if (num_preserved_vals % 2 != 0) {
807  // Not supported yet, revert to the slower code path
808  return true;
809  }
810 
811  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
812  const int block_size = /*256*/128;
813  const int num_per_thread = /*128*/64;
814  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
815  const int max_blocks = device.getNumGpuMultiProcessors() *
816  device.maxGpuThreadsPerMultiProcessor() / block_size;
817  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
818 
819  if (num_blocks > 1) {
820  // We initialize the outputs outside the reduction kernel when we can't be sure that there
821  // won't be race conditions between multiple thread blocks.
822  LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
823  1, 1, 0, device, reducer, self, num_preserved_vals, output);
824  }
825 
826  LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
827  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
828 
829  return false;
830  }
831 };
832 #endif // EIGEN_HAS_GPU_FP16
833 
834 
835 template <typename Self, typename Op>
836 struct InnerReducer<Self, Op, GpuDevice> {
837  // Unfortunately NVIDIA GPUs do not handle exotic types such as complex well,
838  // so we reduce the scope of the optimized version of the code to the simple cases
839  // of doubles, floats and half floats.
840 #ifdef EIGEN_HAS_GPU_FP16
841  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
842  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
843  internal::is_same<typename Self::CoeffReturnType, double>::value ||
844  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
845 #else // EIGEN_HAS_GPU_FP16
846  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
847  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
848  internal::is_same<typename Self::CoeffReturnType, double>::value);
849 #endif // EIGEN_HAS_GPU_FP16
850 
851  template <typename OutputType>
852  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
853  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
854  const Index num_coeffs = array_prod(self.m_impl.dimensions());
855  // Don't crash when we're called with an input tensor of size 0.
856  if (num_coeffs == 0) {
857  return true;
858  }
859  // For small reductions it's faster to fall back to the default (non-GPU-specialized) code path.
860  if (num_coeffs_to_reduce <= 128) {
861  return true;
862  }
863 
864  return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
865  }
866 };
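// For illustration, a rough usage sketch of a partial reduction reaching this code from the Tensor
// API (assumes the GpuDevice setup shown in the earlier full-reduction sketch; names and sizes are
// made up, and whether the Inner or Outer specialization is selected depends on the tensor layout
// and on which dimension is reduced).
void partial_reduction_example(const Eigen::GpuDevice& gpu_device, float* d_in, float* d_out) {
  const int rows = 512, cols = 1024;
  Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, cols);

  // Reduce away dimension 0, keeping `cols` results.
  Eigen::array<Eigen::Index, 1> reduce_dims{{0}};
  out.device(gpu_device) = in.sum(reduce_dims);
}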
867 
868 template <int NumPerThread, typename Self,
869  typename Reducer, typename Index>
870 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
871  typename Self::CoeffReturnType* output) {
872  const Index num_threads = blockDim.x * gridDim.x;
873  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
874  // Initialize the output values if they weren't initialized by the ReductionInitKernel
875  if (gridDim.x == 1) {
876  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
877  output[i] = reducer.initialize();
878  }
879  __syncthreads();
880  }
881 
882  // Do the reduction.
883  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
884  for (Index i = thread_id; i < max_iter; i += num_threads) {
885  const Index input_col = i % num_preserved_coeffs;
886  const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
887  typename Self::CoeffReturnType reduced_val = reducer.initialize();
888  const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
889  for (Index j = input_row; j < max_row; j++) {
890  typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
891  reducer.reduce(val, &reduced_val);
892  }
893  atomicReduce(&(output[input_col]), reduced_val, reducer);
894  }
895 }
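// For illustration, a scalar CPU reference for what OuterReductionKernel computes (a hypothetical
// helper, not part of Eigen). Here the preserved index is the fast-moving one in the flat layout,
// so each output entry gathers strided coefficients from the input.
template <typename T, typename Reducer>
void outer_reduction_reference(const T* input, T* output, long num_preserved_coeffs,
                               long num_coeffs_to_reduce, Reducer reducer) {
  for (long col = 0; col < num_preserved_coeffs; ++col) {
    T reduced_val = reducer.initialize();
    for (long row = 0; row < num_coeffs_to_reduce; ++row) {
      reducer.reduce(input[row * num_preserved_coeffs + col], &reduced_val);
    }
    output[col] = reduced_val;
  }
}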
896 
897 
898 template <typename Self, typename Op>
899 struct OuterReducer<Self, Op, GpuDevice> {
900  // Unfortunately NVIDIA GPUs do not handle exotic types such as complex well,
901  // so we reduce the scope of the optimized version of the code to the simple case
902  // of floats.
903  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
904  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
905  internal::is_same<typename Self::CoeffReturnType, double>::value);
906  template <typename Device, typename OutputType>
907  static
908  #if !defined(EIGEN_HIPCC)
909  // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
910  // (in the cxx11_tensor_reduction_gpu test)
911  //
912  // terminate called after throwing an instance of 'std::runtime_error'
913  // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
914  //
915  // don't know why this happens (and why is it a runtime error instead of a compile time error)
916  //
917  // this will be fixed by HIP PR#457
918  EIGEN_DEVICE_FUNC
919  #endif
920  bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
921  gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
922  return true;
923  }
924 
925  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
926  typedef typename Self::Index Index;
927 
928  // For small reductions it's faster to fall back to the default (non-GPU-specialized) code path.
929  if (num_coeffs_to_reduce <= 32) {
930  return true;
931  }
932 
933  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
934  const int block_size = 256;
935  const int num_per_thread = 16;
936  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
937  const int max_blocks = device.getNumGpuMultiProcessors() *
938  device.maxGpuThreadsPerMultiProcessor() / block_size;
939  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
940 
941  if (num_blocks > 1) {
942  // We initialize the outputs outside the reduction kernel when we can't be sure that there
943  // won't be race conditions between multiple thread blocks.
944  const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
945  const int max_blocks = device.getNumGpuMultiProcessors() *
946  device.maxGpuThreadsPerMultiProcessor() / 1024;
947  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
948  LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
949  num_blocks, 1024, 0, device, reducer.initialize(),
950  num_preserved_vals, output);
951  }
952 
953  LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
954  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
955 
956  return false;
957  }
958 };
959 
960 #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
961 
962 
963 } // end namespace internal
964 } // end namespace Eigen
965 
966 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H