TensorConvolutionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

namespace Eigen {
enum class convolution_type { CONV1D, CONV2D, CONV3D };

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
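// Partial specialization for 1D convolution: each work-group loads a tile of the input
// (its local range plus the kernel halo) into SYCL local memory, padding out-of-range
// elements with zero, and each in-range work-item then accumulates one output element.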
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the number of input elements that must be loaded for each plane in shared memory
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    // fill the local memory
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution; output start x
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};

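// Partial specialization for 2D convolution: the local-memory tile is
// (local_range[0] + kernel_size[0] - 1) x (local_range[1] + kernel_size[1] - 1) per plane,
// and the third nd_range dimension indexes the plane.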
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the number of input elements that must be loaded for each plane in shared memory
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output offset start for each thread
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};

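// Partial specialization for 3D convolution: all three kernel dimensions are tiled in
// local memory, so the planes (numP of them) are processed sequentially inside the
// work-item loop, with barriers separating the load and compute phases of each plane.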
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      // fill the local memory
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution

      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};

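// Evaluator of the convolution expression on a SYCL device. The result is computed
// eagerly into a device buffer by executeEval(), which dispatches to one of the
// 1D/2D/3D kernels above depending on the number of convolution dimensions.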
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

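  // Evaluates the convolution into `data` when a destination buffer is supplied;
  // otherwise the result goes into a temporary device buffer that coeff()/packet() read from.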
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }

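  // Sets up the nd_range and the local-memory size (tile plus kernel halo) and launches
  // the convolution kernel matching NumKernelDims.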
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }

      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }

      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
    m_kernel.bind(cgh);
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};  // struct TensorEvaluator

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
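
For reference, a minimal usage sketch (not part of this header; device setup, tensor sizes, and names are illustrative assumptions) showing how a tensor convolution evaluated on a SYCL device reaches the CONV2D path above. It follows the pattern used by Eigen's SYCL tensor tests rather than an API documented in this file.

// Hedged sketch: assumes a working SYCL runtime and the Eigen SYCL tensor backend.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
  Eigen::SyclDevice sycl_device(&queue_interface);

  // Host tensors: a 32x32x8 input convolved with a 3x3 kernel along dims {0, 1}.
  Eigen::Tensor<float, 3> input(32, 32, 8);
  Eigen::Tensor<float, 2> kernel(3, 3);
  Eigen::Tensor<float, 3> output(30, 30, 8);  // 32 - 3 + 1 = 30 along each convolved dim
  input.setRandom();
  kernel.setRandom();

  // Device buffers managed by the SyclDevice allocator.
  float *d_input = static_cast<float *>(sycl_device.allocate(input.size() * sizeof(float)));
  float *d_kernel = static_cast<float *>(sycl_device.allocate(kernel.size() * sizeof(float)));
  float *d_output = static_cast<float *>(sycl_device.allocate(output.size() * sizeof(float)));
  sycl_device.memcpyHostToDevice(d_input, input.data(), input.size() * sizeof(float));
  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel.size() * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_input(d_input, 32, 32, 8);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_kernel(d_kernel, 3, 3);
  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_output(d_output, 30, 30, 8);

  // Convolving along two dimensions dispatches to the CONV2D kernel in this file.
  Eigen::array<Eigen::Index, 2> dims{{0, 1}};
  gpu_output.device(sycl_device) = gpu_input.convolve(gpu_kernel, dims);

  sycl_device.memcpyDeviceToHost(output.data(), d_output, output.size() * sizeof(float));
  sycl_device.deallocate(d_input);
  sycl_device.deallocate(d_kernel);
  sycl_device.deallocate(d_output);
  return 0;
}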