// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialization of the reduction operation. A two-phase
 *  reduction approach is used because the GPU provides no global
 *  synchronization of global memory across different work-groups/thread
 *  blocks. To work around this, two kernels are used to reduce the data:
 *  in the first phase each work-group/thread block reduces its portion of
 *  the data locally and writes its partial result to global memory. In the
 *  second phase (global reduction) a single work-group/thread block reduces
 *  the intermediate results down to one element.
 *  Here is an NVIDIA presentation explaining the optimized two-phase
 *  reduction algorithm on GPUs:
 *  https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/

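/*
 * Illustrative sketch (not part of Eigen): the two-phase scheme described
 * above, modelled on the host with plain C++. Each "work-group" of size wg
 * first reduces its chunk to one partial value (phase 1: local reduction and
 * write-out of the partial result); a single final pass then reduces the
 * partial results to one element (phase 2: global reduction). The function
 * name and the work-group size are made up for the example.
 *
 *   #include <algorithm>
 *   #include <cstddef>
 *   #include <numeric>
 *   #include <vector>
 *
 *   float two_phase_sum(const std::vector<float> &in, std::size_t wg) {
 *     std::vector<float> partials;  // one partial result per "work-group"
 *     for (std::size_t g = 0; g * wg < in.size(); ++g) {  // phase 1
 *       auto first = in.begin() + g * wg;
 *       auto last = in.begin() + std::min(in.size(), (g + 1) * wg);
 *       partials.push_back(std::accumulate(first, last, 0.0f));
 *     }
 *     // phase 2: one "work-group" reduces the intermediate results.
 *     return std::accumulate(partials.begin(), partials.end(), 0.0f);
 *   }
 */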
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {

template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};

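/*
 * In other words, a mean reduction is rewritten as a plain sum followed by a
 * single division in the final step (OpDefiner::finalise_op). A host-side
 * model of the same idea (illustrative only, not Eigen code):
 *
 *   #include <vector>
 *
 *   double mean_via_sum(const std::vector<double> &v) {
 *     if (v.empty()) return 0.0;
 *     double acc = 0.0;                 // SumReducer-style accumulation
 *     for (double x : v) acc += x;
 *     return acc / static_cast<double>(v.size());  // finalise_op scaling
 *   }
 */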
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    // Our empirical research shows that the best performance is achieved when
    // there is only one element per thread to reduce in the second step; the
    // second-step reduction time is then almost negligible. Hence, in the
    // second step the input size is fixed to the local size, so there is only
    // one element read per thread. The algorithm must be changed if the number
    // of elements reduced per thread in the second step is greater than one;
    // otherwise the result will be wrong.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI.get_pointer() + localid;
    auto aOutPtr = outAcc.get_pointer();
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

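/*
 * The loop above is the classic power-of-two tree reduction in local memory:
 * at every step the upper half of the scratch buffer is folded into the lower
 * half until element 0 holds the result. A host-side model of the same loop
 * for a sum (illustrative only; the helper name is made up):
 *
 *   #include <cstddef>
 *   #include <vector>
 *
 *   float tree_reduce_sum(std::vector<float> scratch) {
 *     // scratch.size() is assumed to be a power of 2, as in the kernels here.
 *     for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
 *       // On the device a work-group barrier would be placed here.
 *       for (std::size_t lid = 0; lid < offset; ++lid) {  // "threads" with lid < offset
 *         scratch[lid] += scratch[lid + offset];
 *       }
 *     }
 *     return scratch[0];
 *   }
 */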
// Full reduction, first phase. In this version vectorization is enabled and
// the reduction accepts any generic reducer op, e.g. max, min, sum, mean,
// iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef
      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                                              PacketReturnType, CoeffReturnType>::type OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable parts
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non vectorizable parts
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // reduction parts. The local size is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    CoeffReturnType accumulator = op.initialize();
    // non vectorizable parts
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // reduction parts. The local size is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

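/*
 * The vectorised path above first covers (rng / PacketSize) * PacketSize
 * coefficients with packet loads and then handles the remaining scalar tail.
 * The same split on the host (illustrative only; a packet is modelled as a
 * small fixed-size chunk):
 *
 *   float packet_style_sum(const float *data, int n, int packet_size = 4) {
 *     const int vectorized_range = (n / packet_size) * packet_size;
 *     float acc = 0.0f;
 *     for (int i = 0; i < vectorized_range; i += packet_size) {  // "packet" part
 *       for (int k = 0; k < packet_size; ++k) acc += data[i + k];
 *     }
 *     for (int i = vectorized_range; i < n; ++i) acc += data[i];  // scalar tail
 *     return acc;
 *   }
 */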
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    auto output_accessor_ptr = output_accessor.get_pointer();
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

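/*
 * Conceptually, the kernel above assigns one work-item per preserved output
 * coefficient and lets it reduce its values serially. A rough host-side model
 * of that assignment for a sum (illustrative only; the function name and the
 * flat p * num_values + r input layout are made up for the example):
 *
 *   #include <cstddef>
 *   #include <vector>
 *
 *   void per_output_serial_sum(const std::vector<float> &in, std::vector<float> &out,
 *                              std::size_t num_values) {
 *     for (std::size_t p = 0; p < out.size(); ++p) {   // one "work-item" per output
 *       float accum = 0.0f;                            // functor.initialize()
 *       for (std::size_t r = 0; r < num_values; ++r) {
 *         accum += in[p * num_values + r];             // functor.reduce(...)
 *       }
 *       out[p] = accum;                                // functor.finalize(accum)
 *     }
 *   }
 */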
enum class reduction_dim { inner_most, outer_most };
// default is preserver
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    auto scratchPtr = scratch.get_pointer().get();
    auto outPtr =
        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // For the mean reducer the result has already been divided in the
        // previous reduction step, so there is no need to divide it again.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor.get_pointer() + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is not bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and
    // <= maxSyclThreadPerBlock; this helps us avoid an extra check in the
    // kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // In this step we force the code to use at most a two-step reduction.
    // Our empirical research shows that if each thread reduces at least 64
    // elements individually, we get better performance; however, this can
    // change across platforms. It also shows that for the inner_most dim
    // reducer it is better to have 8 groups in the reduce dimension for sizes
    // > 1024 to achieve the best performance.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups);

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);
    }
    return false;
  }
};
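/*
 * Worked example for the launch configuration above (illustrative; it assumes
 * EIGEN_SYCL_LOCAL_THREAD_DIM0 == EIGEN_SYCL_LOCAL_THREAD_DIM1 == 16, i.e.
 * localRange == 256, and a device for which
 * getPowerOfTwo(getNumSyclMultiProcessors(), true) returns 64). With
 * num_coeffs_to_preserve = 512 and num_coeffs_to_reduce = 4096:
 *
 *   roundUpP    = roundUp(512, 16)   = 512
 *   pNumGroups  = 512 / 16           = 32
 *   rGroups     = (64 + 32 - 1) / 32 = 2
 *   rNumGroups  = 1, because 4096 <= 64 * 256 (= reductionPerThread * localRange)
 *   globalRange = 32 * 1 * 256       = 8192 work-items
 *
 * Since rNumGroups == 1, the partial-reduction kernel writes directly to the
 * output and the SecondStepPartialReduction kernel is not launched.
 */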
} // namespace internal
} // namespace TensorSycl

namespace internal {

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // In this step we force the code to use at most a two-step reduction.
    // Our empirical research shows that if each thread reduces at least 512
    // elements individually, we get better performance.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer);

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer);

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer);
    }
  }
};
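/*
 * A minimal user-side sketch of the path that reaches the FullReducer
 * specialization above (illustrative only; it assumes Eigen was built with
 * SYCL support and follows the device-setup pattern used in Eigen's SYCL
 * tests):
 *
 *   #include <unsupported/Eigen/CXX11/Tensor>
 *
 *   void full_sum_on_sycl() {
 *     auto devices = Eigen::get_sycl_supported_devices();
 *     Eigen::QueueInterface queue_interface(devices[0]);
 *     Eigen::SyclDevice sycl_device(&queue_interface);
 *
 *     Eigen::Tensor<float, 2> in(512, 512);
 *     in.setRandom();
 *     float *gpu_in = static_cast<float *>(sycl_device.allocate(in.size() * sizeof(float)));
 *     float *gpu_out = static_cast<float *>(sycl_device.allocate(sizeof(float)));
 *     sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
 *
 *     Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 512, 512);
 *     Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(gpu_out);
 *     out_gpu.device(sycl_device) = in_gpu.sum();  // dispatches to FullReducer
 *
 *     float result;
 *     sycl_device.memcpyDeviceToHost(&result, gpu_out, sizeof(float));
 *     sycl_device.deallocate(gpu_in);
 *     sycl_device.deallocate(gpu_out);
 *   }
 */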
// Vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    return false;
  }
};

} // namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP