TensorDeviceSycl.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Mehdi Goli Codeplay Software Ltd.
5 // Ralph Potter Codeplay Software Ltd.
6 // Luke Iwanski Codeplay Software Ltd.
7 // Contact: <eigen@codeplay.com>
8 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
9 
10 //
11 // This Source Code Form is subject to the terms of the Mozilla
12 // Public License v. 2.0. If a copy of the MPL was not distributed
13 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
14 
15 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
16 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
17 #include <unordered_set>
18 
19 namespace Eigen {
20 
21 namespace TensorSycl {
22 namespace internal {
23 
25 struct SyclDeviceInfo {
26  SyclDeviceInfo(cl::sycl::queue queue)
27  : local_mem_type(
28  queue.get_device()
29  .template get_info<cl::sycl::info::device::local_mem_type>()),
30  max_work_item_sizes(
31  queue.get_device()
32  .template get_info<
33  cl::sycl::info::device::max_work_item_sizes>()),
34  max_mem_alloc_size(
35  queue.get_device()
36  .template get_info<
37  cl::sycl::info::device::max_mem_alloc_size>()),
38  max_compute_units(queue.get_device()
39  .template get_info<
40  cl::sycl::info::device::max_compute_units>()),
41  max_work_group_size(
42  queue.get_device()
43  .template get_info<
44  cl::sycl::info::device::max_work_group_size>()),
45  local_mem_size(
46  queue.get_device()
47  .template get_info<cl::sycl::info::device::local_mem_size>()),
48  platform_name(queue.get_device()
49  .get_platform()
50  .template get_info<cl::sycl::info::platform::name>()),
51  device_name(queue.get_device()
52  .template get_info<cl::sycl::info::device::name>()),
53  device_vendor(
54  queue.get_device()
55  .template get_info<cl::sycl::info::device::vendor>()) {}
56 
57  cl::sycl::info::local_mem_type local_mem_type;
58  cl::sycl::id<3> max_work_item_sizes;
59  unsigned long max_mem_alloc_size;
60  unsigned long max_compute_units;
61  unsigned long max_work_group_size;
62  size_t local_mem_size;
63  std::string platform_name;
64  std::string device_name;
65  std::string device_vendor;
66 };
67 
68 } // end namespace internal
69 } // end namespace TensorSycl
70 
71 typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
72 // All devices (even an AMD CPU exposed through the Intel OpenCL runtime) that
73 // support OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend
74 // and, consequently, TensorFlow via the Eigen SYCL backend.
75 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
76  -> decltype(cl::sycl::device::get_devices()) {
77 #ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
78  return {cl::sycl::device(cl::sycl::default_selector())};
79 #else
80  std::vector<cl::sycl::device> supported_devices;
81  auto platform_list = cl::sycl::platform::get_platforms();
82  for (const auto &platform : platform_list) {
83  auto device_list = platform.get_devices();
84  auto platform_name =
85  platform.template get_info<cl::sycl::info::platform::name>();
86  std::transform(platform_name.begin(), platform_name.end(),
87  platform_name.begin(), ::tolower);
88  for (const auto &device : device_list) {
89  auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
90  std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
91  bool unsupported_condition =
92  (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
93  vendor.find("apu") == std::string::npos) ||
94  (platform_name.find("experimental") != std::string::npos) ||
95  device.is_host();
96  if (!unsupported_condition) {
97  supported_devices.push_back(device);
98  }
99  }
100  }
101  return supported_devices;
102 #endif
103 }
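// Example (illustrative sketch): picking one of the devices returned above and
// wrapping it in the QueueInterface / SyclDevice classes defined further down
// in this file.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   assert(!devices.empty());
//   Eigen::QueueInterface queue_interface(devices[0]);  // owns the SYCL queue
//   Eigen::SyclDevice sycl_device(&queue_interface);    // handed to evaluators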
104 
105 class QueueInterface {
106  public:
108  template <typename DeviceOrSelector>
109  explicit QueueInterface(
110  const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
111  unsigned num_threads = std::thread::hardware_concurrency())
112  : m_queue(dev_or_sel, handler),
113 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
114  m_prog(m_queue.get_context(), get_sycl_supported_devices()),
115 #endif
116  m_thread_pool(num_threads),
117  m_device_info(m_queue) {
118 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
119  m_prog.build_with_kernel_type<DeviceOrSelector>();
120  auto f = [&](cl::sycl::handler &cgh) {
121  cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
122  [=]() {});
123  };
124  EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
125 #endif
126  }
127 
128  template <typename DeviceOrSelector>
129  explicit QueueInterface(
130  const DeviceOrSelector &dev_or_sel,
131  unsigned num_threads = std::thread::hardware_concurrency())
132  : QueueInterface(dev_or_sel,
133  [this](cl::sycl::exception_list l) {
134  this->exception_caught_ = this->sycl_async_handler(l);
135  },
136  num_threads) {}
137 
138 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
139  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
140 #endif
141 
143  EIGEN_STRONG_INLINE void *attach_buffer(
144  cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
145  std::lock_guard<std::mutex> lock(pmapper_mutex_);
146  return static_cast<void *>(pMapper.add_pointer(buf));
147  }
148 
150  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
151  std::lock_guard<std::mutex> lock(pmapper_mutex_);
152  TensorSycl::internal::SYCLfree<false>(p, pMapper);
153  }
154 
163  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
164 #if EIGEN_MAX_ALIGN_BYTES > 0
165  size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
166  if (align > 0) {
167  num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
168  }
169 #endif
170  std::lock_guard<std::mutex> lock(pmapper_mutex_);
171  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
172  }
173 
174  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
175 #if EIGEN_MAX_ALIGN_BYTES > 0
176  size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
177  if (align > 0) {
178  num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
179  }
180 #endif
181  std::lock_guard<std::mutex> lock(pmapper_mutex_);
182 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
183  if (scratch_buffers.empty()) {
184  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
186  } else {
187  for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
188  auto buff = pMapper.get_buffer(*it);
189  if (buff.get_size() >= num_bytes) {
190  auto ptr = *it;
191  scratch_buffers.erase(it);
192  return ptr;
193  } else {
194  ++it;
195  }
196  }
197  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
198  }
199 #else
200  return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
201 #endif
202  }
203  template <typename data_t>
204  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
205  cl::sycl::access::mode::read_write, data_t>
206  get(data_t *data) const {
207  return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
208  }
209  template <typename data_t>
210  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
211  TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
212  data_t>
213  data) const {
214  return static_cast<data_t *>(data.get_virtual_pointer());
215  }
216 
217  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
218  std::lock_guard<std::mutex> lock(pmapper_mutex_);
219 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
220  scratch_buffers.insert(p);
221 #else
222  TensorSycl::internal::SYCLfree(p, pMapper);
223 #endif
224  }
225  template <cl::sycl::access::mode AcMd, typename T>
226  EIGEN_STRONG_INLINE void deallocate_temp(
227  const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
228  deallocate_temp(p.get_virtual_pointer());
229  }
230 
233  EIGEN_STRONG_INLINE void deallocate(void *p) const {
234  std::lock_guard<std::mutex> lock(pmapper_mutex_);
235  TensorSycl::internal::SYCLfree(p, pMapper);
236  }
237 
238  EIGEN_STRONG_INLINE void deallocate_all() const {
239  std::lock_guard<std::mutex> lock(pmapper_mutex_);
240  TensorSycl::internal::SYCLfreeAll(pMapper);
241 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
242  scratch_buffers.clear();
243 #endif
244  }
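  // Illustrative note on the temporary-buffer pool above (`dev` below stands
  // for a SyclDevice, defined further down, or this QueueInterface): unless
  // EIGEN_SYCL_NO_REUSE_BUFFERS is defined, deallocate_temp() parks the
  // pointer in scratch_buffers instead of freeing it, and a later
  // allocate_temp() of the same or smaller size may hand the same buffer back.
  //
  //   void* t1 = dev.allocate_temp(4096);  // creates a new SYCL buffer
  //   dev.deallocate_temp(t1);             // returned to the scratch pool
  //   void* t2 = dev.allocate_temp(1024);  // may reuse the 4096-byte buffer
  //   dev.deallocate_all();                // drops the pool and all buffers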
245 
250  EIGEN_STRONG_INLINE void memcpyHostToDevice(
251  void *dst, const void *src, size_t n,
252  std::function<void()> callback) const {
253  static const auto write_mode = cl::sycl::access::mode::discard_write;
254  static const auto global_access = cl::sycl::access::target::global_buffer;
255  typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
256  write_accessor;
257  if (n == 0) {
258  if (callback) callback();
259  return;
260  }
261  n /= sizeof(buffer_scalar_t);
262  auto f = [&](cl::sycl::handler &cgh) {
263  write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
264  buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
265  auto non_deleter = [](buffer_scalar_t const *) {};
266  std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
267  cgh.copy(s_ptr, dst_acc);
268  };
269  cl::sycl::event e;
270  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
271  synchronize_and_callback(e, callback);
272  }
273 
278  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
279  void *dst, const void *src, size_t n,
280  std::function<void()> callback) const {
281  static const auto read_mode = cl::sycl::access::mode::read;
282  static const auto global_access = cl::sycl::access::target::global_buffer;
283  typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
284  read_accessor;
285  if (n == 0) {
286  if (callback) callback();
287  return;
288  }
289  n /= sizeof(buffer_scalar_t);
290  auto f = [&](cl::sycl::handler &cgh) {
291  read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
292  buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
293  auto non_deleter = [](buffer_scalar_t *) {};
294  std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
295  cgh.copy(src_acc, s_ptr);
296  };
297  cl::sycl::event e;
298  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
299  synchronize_and_callback(e, callback);
300  }
301 
305  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
306  static const auto read_mode = cl::sycl::access::mode::read;
307  static const auto write_mode = cl::sycl::access::mode::discard_write;
308  if (n == 0) {
309  return;
310  }
311  n /= sizeof(buffer_scalar_t);
312  auto f = [&](cl::sycl::handler &cgh) {
313  auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
314  auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
315  cgh.copy(src_acc, dst_acc);
316  };
317  cl::sycl::event e;
318  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
319  async_synchronize(e);
320  }
321 
325  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
326  static const auto write_mode = cl::sycl::access::mode::discard_write;
327  if (n == 0) {
328  return;
329  }
330  n /= sizeof(buffer_scalar_t);
331  auto f = [&](cl::sycl::handler &cgh) {
332  auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
333  // The cast to uint8_t is here to match the behaviour of the standard
334  // memset. The cast to buffer_scalar_t is needed to match the type of the
335  // accessor (in case buffer_scalar_t is not uint8_t)
336  cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
337  };
338  cl::sycl::event e;
339  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
340  async_synchronize(e);
341  }
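  // Example (illustrative sketch, hypothetical names) of the copy interface
  // above, using a SyclDevice `dev` (defined further down) and a float
  // allocation obtained from allocate():
  //
  //   std::vector<float> host(256, 1.0f);
  //   const size_t bytes = host.size() * sizeof(float);
  //   float* gpu = static_cast<float*>(dev.allocate(bytes));
  //   dev.memcpyHostToDevice(gpu, host.data(), bytes,
  //                          [] { /* runs once the copy has completed */ });
  //   dev.memcpyDeviceToHost(host.data(), gpu, bytes);  // waits (no callback)
  //   dev.deallocate(gpu);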
342 
350  template <cl::sycl::access::mode AcMd, typename T>
351  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
352  get_range_accessor(const void *ptr) const {
353  static const auto global_access = cl::sycl::access::target::global_buffer;
354  static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
355  typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
356  typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
357 
358  std::lock_guard<std::mutex> lock(pmapper_mutex_);
359 
360  auto original_buffer = pMapper.get_buffer(ptr);
361  const ptrdiff_t offset = pMapper.get_offset(ptr);
362  const ptrdiff_t typed_offset = offset / sizeof(T);
363  eigen_assert(typed_offset >= 0);
364  const auto typed_size = original_buffer.get_size() / sizeof(T);
365  auto buffer = original_buffer.template reinterpret<
366  typename Eigen::internal::remove_const<T>::type>(
367  cl::sycl::range<1>(typed_size));
368  const ptrdiff_t size = buffer.get_count() - typed_offset;
369  eigen_assert(size >= 0);
370  typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
371  1, AcMd, global_access, is_place_holder>
372  placeholder_accessor_t;
373  const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
374  return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
375  cl::sycl::id<1>(typed_offset)),
376  static_cast<size_t>(typed_offset),
377  reinterpret_cast<std::intptr_t>(start_ptr));
378  }
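  // Illustrative note: the "pointer" handed out by allocate() is a virtual
  // address managed by the PointerMapper. get_range_accessor() above recovers
  // the underlying SYCL buffer and the byte offset inside it, reinterprets the
  // buffer to the requested element type, and wraps it in a placeholder
  // accessor; the accessor is only bound to a command group at submission
  // time, via bind(cgh) in the *_kernel_launcher functions below.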
379 
382  template <cl::sycl::access::mode AcMd, typename Index>
383  EIGEN_STRONG_INLINE cl::sycl::accessor<
384  buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
385  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
386  const Index n_bytes) const {
387  static const auto global_access = cl::sycl::access::target::global_buffer;
388  eigen_assert(n_bytes >= 0);
389  std::lock_guard<std::mutex> lock(pmapper_mutex_);
390  auto buffer = pMapper.get_buffer(ptr);
391  const ptrdiff_t offset = pMapper.get_offset(ptr);
392  eigen_assert(offset >= 0);
393  eigen_assert(offset + n_bytes <= buffer.get_size());
394  return buffer.template get_access<AcMd, global_access>(
395  cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
396  }
397 
402  template <cl::sycl::access::mode AcMd>
403  EIGEN_STRONG_INLINE cl::sycl::accessor<
404  buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
405  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
406  std::lock_guard<std::mutex> lock(pmapper_mutex_);
407  return pMapper.get_buffer(ptr)
408  .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
409  cgh);
410  }
411 
412  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
413  const void *ptr) const {
414  std::lock_guard<std::mutex> lock(pmapper_mutex_);
415  return pMapper.get_buffer(ptr);
416  }
417 
418  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
419  std::lock_guard<std::mutex> lock(pmapper_mutex_);
420  return pMapper.get_offset(ptr);
421  }
422 
423  template <typename OutScalar, typename sycl_kernel, typename Lhs,
424  typename Rhs, typename OutPtr, typename Range, typename Index,
425  typename... T>
426  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
427  const Rhs &rhs, OutPtr outptr,
428  Range thread_range,
429  Index scratchSize,
430  T... var) const {
431  auto kernel_functor = [=](cl::sycl::handler &cgh) {
432  // binding the placeholder accessors to a command group handler
433  lhs.bind(cgh);
434  rhs.bind(cgh);
435  outptr.bind(cgh);
436  typedef cl::sycl::accessor<OutScalar, 1,
437  cl::sycl::access::mode::read_write,
438  cl::sycl::access::target::local>
439  LocalAccessor;
440 
441  LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
442  cgh.parallel_for(
443 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
444  program().template get_kernel<sycl_kernel>(),
445 #endif
446  thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
447  };
448  cl::sycl::event e;
449  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
450  async_synchronize(e);
451  }
452 
453  template <typename OutScalar, typename sycl_kernel, typename InPtr,
454  typename OutPtr, typename Range, typename Index, typename... T>
455  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
456  OutPtr &outptr,
457  Range thread_range,
458  Index scratchSize,
459  T... var) const {
460  auto kernel_functor = [=](cl::sycl::handler &cgh) {
461  // binding the placeholder accessors to a command group handler
462  inptr.bind(cgh);
463  outptr.bind(cgh);
464  typedef cl::sycl::accessor<OutScalar, 1,
465  cl::sycl::access::mode::read_write,
466  cl::sycl::access::target::local>
467  LocalAccessor;
468 
469  LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
470  cgh.parallel_for(
471 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
472  program().template get_kernel<sycl_kernel>(),
473 #endif
474  thread_range, sycl_kernel(scratch, inptr, outptr, var...));
475  };
476  cl::sycl::event e;
477  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
478  async_synchronize(e);
479  }
480 
481  template <typename OutScalar, typename sycl_kernel, typename InPtr,
482  typename Range, typename Index, typename... T>
483  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
484  Range thread_range,
485  Index scratchSize,
486  T... var) const {
487  auto kernel_functor = [=](cl::sycl::handler &cgh) {
488  // binding the placeholder accessors to a command group handler
489  inptr.bind(cgh);
490  typedef cl::sycl::accessor<OutScalar, 1,
491  cl::sycl::access::mode::read_write,
492  cl::sycl::access::target::local>
493  LocalAccessor;
494 
495  LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
496  cgh.parallel_for(
497 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
498  program().template get_kernel<sycl_kernel>(),
499 #endif
500  thread_range, sycl_kernel(scratch, inptr, var...));
501  };
502  cl::sycl::event e;
503  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
504  async_synchronize(e);
505  }
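  // Illustrative note on the three launchers above: each binds the placeholder
  // accessors to the command group, creates a local-memory accessor of
  // scratchSize elements, and submits the sycl_kernel functor, whose
  // constructor is expected to take the scratch accessor, the bound
  // pointer(s), and the trailing var... arguments, in that order.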
506 
507 
508  EIGEN_STRONG_INLINE void synchronize() const {
509 #ifdef EIGEN_EXCEPTIONS
510  m_queue.wait_and_throw();
511 #else
512  m_queue.wait();
513 #endif
514  }
515 
516 
517  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
518  set_latest_event(e);
519 #ifndef EIGEN_SYCL_ASYNC_EXECUTION
520  synchronize();
521 #endif
522  }
523 
524  template <typename Index>
525  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
526  Index &rng, Index &GRange) const {
527  tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
528  tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
529  EIGEN_SYCL_LOCAL_THREAD_DIM1),
530  static_cast<Index>(tileSize));
531  rng = n;
532  if (rng == 0) rng = static_cast<Index>(1);
533  GRange = rng;
534  if (tileSize > GRange)
535  tileSize = GRange;
536  else if (GRange > tileSize) {
537  Index xMode = static_cast<Index>(GRange % tileSize);
538  if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
539  }
540  }
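  // Worked example (illustrative) for the 1-D setup above, assuming the
  // largest power-of-two work-group size is 256 and the local-thread-dim cap
  // does not reduce it: for n = 1000, tileSize = 256, rng = 1000, and GRange
  // is rounded up to the next multiple of the tile size, i.e.
  // 1000 + (256 - 1000 % 256) = 1024; the extra work-items (global id >= rng)
  // are typically guarded against inside the kernels.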
541 
544  template <typename Index>
545  EIGEN_STRONG_INLINE void parallel_for_setup(
546  const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
547  cl::sycl::range<2> &local_range) const {
548  std::array<Index, 2> input_range = input_dim;
549  Index max_workgroup_Size =
550  static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
551  max_workgroup_Size =
552  std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
553  EIGEN_SYCL_LOCAL_THREAD_DIM1),
554  static_cast<Index>(max_workgroup_Size));
555  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
556  local_range[1] =
557  static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
558  input_range[1] = input_dim[1];
559  if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
560  global_range[1] = input_range[1];
561  if (local_range[1] > global_range[1])
562  local_range[1] = global_range[1];
563  else if (global_range[1] > local_range[1]) {
564  Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
565  if (xMode != 0)
566  global_range[1] += static_cast<Index>(local_range[1] - xMode);
567  }
568  local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
569  input_range[0] = input_dim[0];
570  if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
571  global_range[0] = input_range[0];
572  if (local_range[0] > global_range[0])
573  local_range[0] = global_range[0];
574  else if (global_range[0] > local_range[0]) {
575  Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
576  if (xMode != 0)
577  global_range[0] += static_cast<Index>(local_range[0] - xMode);
578  }
579  }
580 
583  template <typename Index>
584  EIGEN_STRONG_INLINE void parallel_for_setup(
585  const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
586  cl::sycl::range<3> &local_range) const {
587  std::array<Index, 3> input_range = input_dim;
588  Index max_workgroup_Size =
589  static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
590  max_workgroup_Size =
591  std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
592  EIGEN_SYCL_LOCAL_THREAD_DIM1),
593  static_cast<Index>(max_workgroup_Size));
594  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
595  local_range[2] =
596  static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
597  input_range[2] = input_dim[2];
598  if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
599  global_range[2] = input_range[2];
600  if (local_range[2] > global_range[2])
601  local_range[2] = global_range[2];
602  else if (global_range[2] > local_range[2]) {
603  Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
604  if (xMode != 0)
605  global_range[2] += static_cast<Index>(local_range[2] - xMode);
606  }
607  pow_of_2 = static_cast<Index>(
608  std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
609  local_range[1] =
610  static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
611  input_range[1] = input_dim[1];
612  if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
613  global_range[1] = input_range[1];
614  if (local_range[1] > global_range[1])
615  local_range[1] = global_range[1];
616  else if (global_range[1] > local_range[1]) {
617  Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
618  if (xMode != 0)
619  global_range[1] += static_cast<Index>(local_range[1] - xMode);
620  }
621  local_range[0] = static_cast<Index>(max_workgroup_Size /
622  (local_range[1] * local_range[2]));
623  input_range[0] = input_dim[0];
624  if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
625  global_range[0] = input_range[0];
626  if (local_range[0] > global_range[0])
627  local_range[0] = global_range[0];
628  else if (global_range[0] > local_range[0]) {
629  Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
630  if (xMode != 0)
631  global_range[0] += static_cast<Index>(local_range[0] - xMode);
632  }
633  }
634 
635  EIGEN_STRONG_INLINE bool has_local_memory() const {
636 #if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
637  return false;
638 #elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
639  return true;
640 #else
641  return m_device_info.local_mem_type ==
642  cl::sycl::info::local_mem_type::local;
643 #endif
644  }
645 
646  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
647  return m_device_info.max_mem_alloc_size;
648  }
649 
650  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
651  return m_device_info.max_compute_units;
652  }
653 
654  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
655  return m_device_info.max_work_group_size;
656  }
657 
658  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
659  return m_device_info.max_work_item_sizes;
660  }
661 
663  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
664 
665  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
666  // OpenCL does not have such a concept
667  return 2;
668  }
669 
670  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
671  return m_device_info.local_mem_size;
672  }
673 
674  // Returns the largest power-of-two work-group size that is <= the
675  // maximum device work-group size.
676  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
677  return getPowerOfTwo(m_device_info.max_work_group_size, false);
678  }
679 
680  EIGEN_STRONG_INLINE std::string getPlatformName() const {
681  return m_device_info.platform_name;
682  }
683 
684  EIGEN_STRONG_INLINE std::string getDeviceName() const {
685  return m_device_info.device_name;
686  }
687 
688  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
689  return m_device_info.device_vendor;
690  }
691 
692  // Returns the nearest power of 2:
693  // if roundUp is true, the result is >= wGSize;
694  // otherwise the result is <= wGSize.
695  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
696  if (roundUp) --wGSize;
697  wGSize |= (wGSize >> 1);
698  wGSize |= (wGSize >> 2);
699  wGSize |= (wGSize >> 4);
700  wGSize |= (wGSize >> 8);
701  wGSize |= (wGSize >> 16);
702 #if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
703  wGSize |= (wGSize >> 32);
704 #endif
705  return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
706  }
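  // Worked example (illustrative) for getPowerOfTwo above: the shift/OR
  // cascade smears the highest set bit into every lower bit. For wGSize = 300:
  //   roundUp == false: smear(300) = 511, 511 - (511 >> 1) = 256  (largest 2^k <= 300)
  //   roundUp == true : smear(300 - 1) = 511, 511 + 1 = 512       (smallest 2^k >= 300)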
707 
708  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
709 
710  // This function checks if the runtime recorded an error for the
711  // underlying stream device.
712  EIGEN_STRONG_INLINE bool ok() const {
713  if (!exception_caught_) {
714  synchronize();
715  }
716  return !exception_caught_;
717  }
718 
719  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
720 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
721  std::lock_guard<std::mutex> lock(event_mutex_);
722  return latest_events_[std::this_thread::get_id()];
723 #else
724  eigen_assert(false);
725  return cl::sycl::event();
726 #endif
727  }
728 
729  // destructor
730  ~QueueInterface() {
731  pMapper.clear();
732 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
733  scratch_buffers.clear();
734 #endif
735  }
736 
737  protected:
738  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
739 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
740  std::lock_guard<std::mutex> lock(event_mutex_);
741  latest_events_[std::this_thread::get_id()] = e;
742 #else
743  EIGEN_UNUSED_VARIABLE(e);
744 #endif
745  }
746 
747  void synchronize_and_callback(cl::sycl::event e,
748  const std::function<void()> &callback) const {
749  set_latest_event(e);
750  if (callback) {
751  auto callback_ = [=]() {
752 #ifdef EIGEN_EXCEPTIONS
753  cl::sycl::event(e).wait_and_throw();
754 #else
755  cl::sycl::event(e).wait();
756 #endif
757  callback();
758  };
759  m_thread_pool.Schedule(std::move(callback_));
760  } else {
761 #ifdef EIGEN_EXCEPTIONS
762  m_queue.wait_and_throw();
763 #else
764  m_queue.wait();
765 #endif
766  }
767  }
768 
769  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
770  bool exception_caught = false;
771  for (const auto &e : exceptions) {
772  if (e) {
773  exception_caught = true;
774  EIGEN_THROW_X(e);
775  }
776  }
777  return exception_caught;
778  }
779 
781  bool exception_caught_ = false;
782 
783  mutable std::mutex pmapper_mutex_;
784 
785 #ifdef EIGEN_SYCL_STORE_LATEST_EVENT
786  mutable std::mutex event_mutex_;
787  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
788 #endif
789 
794  mutable TensorSycl::internal::PointerMapper pMapper;
795 #ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
796  mutable std::unordered_set<void *> scratch_buffers;
797 #endif
798  mutable cl::sycl::queue m_queue;
800 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
801  mutable cl::sycl::program m_prog;
802 #endif
803 
806  mutable Eigen::ThreadPool m_thread_pool;
807 
808  const TensorSycl::internal::SyclDeviceInfo m_device_info;
809 };
810 
811 struct SyclDeviceBase {
814  const QueueInterface *m_queue_stream;
815  explicit SyclDeviceBase(const QueueInterface *queue_stream)
816  : m_queue_stream(queue_stream) {}
817  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
818  return m_queue_stream;
819  }
820 };
821 
822 // Here is a SYCL device struct which accepts the SYCL queue interface
823 // as an input
824 struct SyclDevice : public SyclDeviceBase {
825  explicit SyclDevice(const QueueInterface *queue_stream)
826  : SyclDeviceBase(queue_stream) {}
827 
828  // this is the accessor used to construct the evaluator
829  template <cl::sycl::access::mode AcMd, typename T>
830  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
831  get_range_accessor(const void *ptr) const {
832  return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
833  }
834 
835  // get sycl accessor
836  template <cl::sycl::access::mode AcMd>
837  EIGEN_STRONG_INLINE cl::sycl::accessor<
838  buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
839  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
840  return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
841  }
842 
844  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
845  const void *ptr) const {
846  return queue_stream()->get_sycl_buffer(ptr);
847  }
848 
851  template <typename Index>
852  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
853  Index &rng, Index &GRange) const {
854  queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
855  }
856 
859  template <typename Index>
860  EIGEN_STRONG_INLINE void parallel_for_setup(
861  const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
862  cl::sycl::range<2> &local_range) const {
863  queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
864  }
865 
868  template <typename Index>
869  EIGEN_STRONG_INLINE void parallel_for_setup(
870  const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
871  cl::sycl::range<3> &local_range) const {
872  queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
873  }
874 
876  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
877  return queue_stream()->allocate(num_bytes);
878  }
879 
880  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
881  return queue_stream()->allocate_temp(num_bytes);
882  }
883 
885  EIGEN_STRONG_INLINE void deallocate(void *p) const {
886  queue_stream()->deallocate(p);
887  }
888 
889  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
890  queue_stream()->deallocate_temp(buffer);
891  }
892  template <cl::sycl::access::mode AcMd, typename T>
893  EIGEN_STRONG_INLINE void deallocate_temp(
894  const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
895  queue_stream()->deallocate_temp(buffer);
896  }
897  EIGEN_STRONG_INLINE void deallocate_all() const {
898  queue_stream()->deallocate_all();
899  }
900 
901  template <typename data_t>
902  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
903  cl::sycl::access::mode::read_write, data_t>
904  get(data_t *data) const {
905  return queue_stream()->get(data);
906  }
907  template <typename data_t>
908  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
909  TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
910  data_t>
911  data) const {
912  return queue_stream()->get(data);
913  }
914 
916  EIGEN_STRONG_INLINE void *attach_buffer(
917  cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
918  return queue_stream()->attach_buffer(buf);
919  }
921  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
922  queue_stream()->detach_buffer(p);
923  }
924  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
925  return queue_stream()->get_offset(ptr);
926  }
927 
928  // some runtime conditions that can be applied here
929  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
930 
932  template <typename Index>
933  EIGEN_STRONG_INLINE void memcpyHostToDevice(
934  Index *dst, const Index *src, size_t n,
935  std::function<void()> callback = {}) const {
936  queue_stream()->memcpyHostToDevice(dst, src, n, callback);
937  }
939  template <typename Index>
940  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
941  void *dst, const Index *src, size_t n,
942  std::function<void()> callback = {}) const {
943  queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
944  }
946  template <typename Index>
947  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
948  queue_stream()->memcpy(dst, src, n);
949  }
951  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
952  queue_stream()->memset(data, c, n);
953  }
955  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
956  return queue_stream()->sycl_queue();
957  }
958 #ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
959  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
960  return queue_stream()->program();
961  }
962 #endif
963 
964  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
965 
966  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
967  // We won't try to take advantage of the L2 cache for the time being, and
968  // there is no L3 cache on SYCL devices.
969  return firstLevelCacheSize();
970  }
971  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
972  return queue_stream()->getNumSyclMultiProcessors();
973  }
974  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
975  return queue_stream()->maxSyclThreadsPerBlock();
976  }
977  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
978  return queue_stream()->maxWorkItemSizes();
979  }
980  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
981  // OpenCL does not have such a concept
982  return queue_stream()->maxSyclThreadsPerMultiProcessor();
983  }
984  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
985  return queue_stream()->sharedMemPerBlock();
986  }
987  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
988  return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
989  }
990 
991  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
992  return queue_stream()->getPowerOfTwo(val, roundUp);
993  }
995  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
996  return queue_stream()->majorDeviceVersion();
997  }
998 
999  EIGEN_STRONG_INLINE void synchronize() const {
1000  queue_stream()->synchronize();
1001  }
1002  EIGEN_STRONG_INLINE void async_synchronize(
1003  cl::sycl::event e = cl::sycl::event()) const {
1004  queue_stream()->async_synchronize(e);
1005  }
1006  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
1007  return queue_stream()->get_latest_event();
1008  }
1009 
1010  // This function checks if the runtime recorded an error for the
1011  // underlying stream device.
1012  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
1013 
1014  EIGEN_STRONG_INLINE bool has_local_memory() const {
1015  return queue_stream()->has_local_memory();
1016  }
1017  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
1018  return queue_stream()->max_buffer_size();
1019  }
1020  EIGEN_STRONG_INLINE std::string getPlatformName() const {
1021  return queue_stream()->getPlatformName();
1022  }
1023  EIGEN_STRONG_INLINE std::string getDeviceName() const {
1024  return queue_stream()->getDeviceName();
1025  }
1026  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
1027  return queue_stream()->getDeviceVendor();
1028  }
1029  template <typename OutScalar, typename KernelType, typename... T>
1030  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
1031  queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
1032  var...);
1033  }
1034  template <typename OutScalar, typename KernelType, typename... T>
1035  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
1036  queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
1037  var...);
1038  }
1039 
1040  template <typename OutScalar, typename KernelType, typename... T>
1041  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
1042  queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
1043  var...);
1044  }
1045 };
1046 } // end namespace Eigen
1047 
1048 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
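// Example (illustrative sketch, hypothetical names) of driving a tensor
// expression through the SyclDevice defined above; assumes the usual
// unsupported/Eigen/CXX11/Tensor include and float data already uploaded with
// memcpyHostToDevice():
//
//   Eigen::QueueInterface qi(Eigen::get_sycl_supported_devices()[0]);
//   Eigen::SyclDevice dev(&qi);
//   const ptrdiff_t n = 1024;
//   float* da = static_cast<float*>(dev.allocate(n * sizeof(float)));
//   float* db = static_cast<float*>(dev.allocate(n * sizeof(float)));
//   float* dc = static_cast<float*>(dev.allocate(n * sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> a(da, n), b(db, n), c(dc, n);
//   c.device(dev) = a + b;   // kernel submitted through the QueueInterface
//   dev.synchronize();
//   dev.deallocate_all();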