TensorBlock.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // This Source Code Form is subject to the terms of the Mozilla
5 // Public License v. 2.0. If a copy of the MPL was not distributed
6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 
8 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10 
11 namespace Eigen {
12 namespace internal {
13 
14 // -------------------------------------------------------------------------- //
15 // Forward declarations for templates defined below.
16 template <typename Scalar, typename IndexType, int NumDims, int Layout>
17 class TensorBlockIO;
18 
19 // -------------------------------------------------------------------------- //
20 // Helper function to compute strides for a densely stored buffer of the given
21 // dimensions.
22 
23 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
24 // this function instead everywhere.
25 template <int Layout, typename IndexType, int NumDims>
26 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
27  const DSizes<IndexType, NumDims>& dimensions) {
28  DSizes<IndexType, NumDims> strides;
29  if (NumDims == 0) return strides;
30 
31  // TODO(ezhulenev): Use templates to unroll this loop (similar to
32  // h_array_reduce in CXX11meta.h)? Benchmark it.
33  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
34  strides[0] = 1;
35  for (int i = 1; i < NumDims; ++i) {
36  strides[i] = strides[i - 1] * dimensions[i - 1];
37  }
38  } else {
39  strides[NumDims - 1] = 1;
40  for (int i = NumDims - 2; i >= 0; --i) {
41  strides[i] = strides[i + 1] * dimensions[i + 1];
42  }
43  }
44 
45  return strides;
46 }
47 
48 template <int Layout, typename IndexType, size_t NumDims>
49 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
50  const Eigen::array<IndexType, NumDims>& dimensions) {
51  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
52 }
53 
54 template <int Layout, std::ptrdiff_t... Indices>
55 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
56  const Sizes<Indices...>& sizes) {
57  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
58 }
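//
// Example (a quick sketch of the helpers above): for a 2x3x4 buffer,
//   strides<ColMajor>(DSizes<Index, 3>(2, 3, 4)) == {1, 2, 6}
//   strides<RowMajor>(DSizes<Index, 3>(2, 3, 4)) == {12, 4, 1}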
59 
60 // -------------------------------------------------------------------------- //
61 
62 // Tensor block shape type defines the shape preference for the blocks
63 // extracted from the larger tensor.
64 //
65 // Example: blocks of 100 elements from the large 100x100 tensor:
66 // - tensor: 100x100
67 // - target_block_size: 100
68 //
69 // TensorBlockShapeType:
70 // - kUniformAllDims: 100 blocks of size 10x10
71 // - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
72 // or row major layout)
73 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
74 
75 struct TensorBlockResourceRequirements {
76  TensorBlockShapeType shape_type; // target block shape
77  size_t size; // target block size
78  TensorOpCost cost_per_coeff; // cost of computing a single block element
79 
80 #ifdef EIGEN_HIPCC
81  // For HIPCC, we need to explicitly declare the constructor, which is
82  // implicitly invoked in the "merge" / "any" routines, as a "device fun";
83  // otherwise HIPCC errors out, complaining about the lack of a matching constructor.
84  EIGEN_DEVICE_FUNC
85  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
86  TensorOpCost cost_)
87  : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
88  {}
89 #endif
90 
91  template <typename Scalar>
92  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
93  TensorBlockShapeType shape_type, size_t size_in_bytes,
94  TensorOpCost cost) {
95  const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
96  return {shape_type, size, cost};
97  }
98 
99  template <typename Scalar>
100  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
101  TensorBlockShapeType shape_type, size_t size_in_bytes) {
102  // This default cost per coefficient is valid for most materialized tensor
103  // block evaluation implementations, because they typically just read
104  // coefficients from the underlying tensor storage, and write to the tensor
105  // block buffer (scratch or destination memory; reads and writes have a linear
106  // access pattern). We ignore the fixed cost of block evaluation, because in
107  // practice it should be negligible.
108  //
109  // Lazy block evaluation adds the cost of calling a functor for each
110  // coefficient.
111  //
112  // All non-trivial block evaluation implementations must provide their own
113  // cost approximation (e.g. shuffling inner dimension has a much higher cost
114  // because it reads memory randomly, although the total number of moved
115  // bytes is the same).
116  return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
117  {/*bytes_loaded=*/sizeof(Scalar),
118  /*bytes_stored=*/sizeof(Scalar),
119  /*compute_cycles=*/0});
120  }
121 
122  template <typename Scalar>
123  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
124  size_t size_in_bytes) {
125  return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
126  size_in_bytes);
127  }
128 
129  template <typename Scalar>
130  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
131  size_t size_in_bytes) {
132  return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
133  size_in_bytes);
134  }
135 
136  EIGEN_DEVICE_FUNC
137  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
138  merge(const TensorBlockResourceRequirements& lhs,
139  const TensorBlockResourceRequirements& rhs) {
140  return {merge(lhs.shape_type, rhs.shape_type), // shape_type
141  merge(lhs.size, rhs.size), // size
142  merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
143  }
144 
145  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
146  TensorOpCost cost) {
147  cost_per_coeff += cost;
148  return *this;
149  }
150 
151  // This is a resource requirement that should be returned from expressions
152  // that do not have any block evaluation preference (e.g. default tensor
153  // expression with raw buffer access).
154  EIGEN_DEVICE_FUNC
155  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
156  return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
157  }
158 
159  private:
160  using Requirements = TensorBlockResourceRequirements;
161 
162  EIGEN_DEVICE_FUNC
163  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
164  return numext::maxi(lhs_size, rhs_size);
165  }
166 
167  EIGEN_DEVICE_FUNC
168  static EIGEN_STRONG_INLINE TensorBlockShapeType
169  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
170  return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
171  rhs == TensorBlockShapeType::kSkewedInnerDims)
172  ? TensorBlockShapeType::kSkewedInnerDims
173  : TensorBlockShapeType::kUniformAllDims;
174  }
175 
176  EIGEN_DEVICE_FUNC
177  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
178  TensorOpCost rhs_cost) {
179  return lhs_cost + rhs_cost;
180  }
181 };
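//
// Usage sketch (hypothetical byte budgets), based only on the static helpers
// defined above: build per-expression requirements and merge them for an
// expression tree.
//
//   auto lhs = TensorBlockResourceRequirements::skewed<float>(/*size_in_bytes=*/48 * 1024);
//   auto rhs = TensorBlockResourceRequirements::uniform<float>(/*size_in_bytes=*/16 * 1024);
//   auto req = TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.shape_type == kSkewedInnerDims (skewed wins), req.size == max of the
//   // two element counts, req.cost_per_coeff == sum of the two costs.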
182 
183 // -------------------------------------------------------------------------- //
184 // TensorBlockDescriptor specifies a block offset within a tensor and the block
185 // sizes along each of the tensor dimensions.
186 
187 template <int NumDims, typename IndexType = Eigen::Index>
188 class TensorBlockDescriptor {
189  public:
190  typedef DSizes<IndexType, NumDims> Dimensions;
191 
192  // If we evaluate a Tensor assignment, and the expression on the left already
193  // has a memory buffer, then we might do a performance optimization and
194  // evaluate the root expression directly into the final output memory.
195  // Sometimes it's possible to reuse it for materializing subexpressions inside
196  // an expression tree, to avoid dynamic memory allocation.
197  //
198  // The pointer type of the underlying storage is erased, because passing the
199  // Scalar type through all the expression evaluation layers would require way
200  // too many templates. In practice the destination buffer type should always
201  // match the evaluated expression scalar type.
202  class DestinationBuffer {
203  public:
204  enum DestinationBufferKind : int {
205  // The above explicit specification of "int" as the enum basetype is
206  // needed to get around a HIPCC link error ("the field type is not
207  // amp-compatible")
208  // which is issued for class members with the enum type.
209  // TODO(rocm):
210  // remove the "int" basetype once HIPCC has been fixed to not error out
211  // in the above scenario.
212 
213  // Destination buffer is not defined (`m_data` == nullptr).
214  kEmpty,
215 
216  // Tensor block defined by an owning tensor block descriptor can fit
217  // contiguously into the destination buffer. In this case it's safe to
218  // materialize the tensor block in the destination buffer, wrap it in a
219  // TensorMap, and use it to build an Eigen expression on top of it.
220  kContiguous,
221 
222  // Destination buffer strides do not match strides of the contiguously
223  // stored block, and it's impossible to define a TensorMap over this
224  // buffer. However if we are evaluating a root of an expression tree, we
225  // still can materialize an output into this destination, because we can
226  // guarantee that no one will ever access it through block API.
227  //
228  // In theory it is possible to build valid TensorStriding<TensorMap>
229  // expression on top of this destination buffer, however it has
230  // inefficient coeff/packet access, and defeats the purpose of fast block
231  // evaluation API.
232  kStrided
233  };
234 
235  template <typename Scalar>
236  Scalar* data() const {
237  eigen_assert(m_data_type_size == sizeof(Scalar));
238  return static_cast<Scalar*>(m_data);
239  }
240 
241  const Dimensions& strides() const { return m_strides; }
242  const DestinationBufferKind& kind() const { return m_kind; }
243 
244  private:
245  friend class TensorBlockDescriptor;
246 
247  DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
248 
249  template <typename Scalar>
250  DestinationBuffer(Scalar* data, const Dimensions& strides,
251  DestinationBufferKind kind)
252  : m_data(static_cast<void*>(data)),
253  m_data_type_size(sizeof(Scalar)),
254  m_strides(strides),
255  m_kind(kind) {}
256 
257  template <int Layout, typename Scalar>
258  static DestinationBuffer make(const TensorBlockDescriptor& desc,
259  Scalar* data, const Dimensions& strides) {
260  return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
261  }
262 
263  template <int Layout>
264  static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
265  const Dimensions& strides) {
266  const Dimensions& desc_dims = desc.dimensions();
267  const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
268  for (int i = 0; i < NumDims; ++i) {
269  if (desc_dims[i] == 1) continue;
270  if (desc_strides[i] != strides[i]) return kStrided;
271  }
272  return kContiguous;
273  }
274 
275  // Storage pointer is type erased, to reduce template bloat, but we still
276  // keep the size of the underlying element type for error checking.
277  void* m_data;
278  size_t m_data_type_size;
279 
280  // Destination buffer dimensions always match the dimensions of the tensor
281  // block descriptor it belongs to; however, strides might be different.
282  Dimensions m_strides;
283 
284  DestinationBufferKind m_kind;
285  };
286 
287  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
288  const DestinationBuffer& destination)
289  : m_offset(offset),
290  m_dimensions(dimensions),
291  m_destination(destination) {}
292 
293  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
294  : m_offset(offset),
295  m_dimensions(dimensions),
296  m_destination(DestinationBuffer()) {}
297 
298  IndexType offset() const { return m_offset; }
299  const Dimensions& dimensions() const { return m_dimensions; }
300  IndexType dimension(int index) const { return m_dimensions[index]; }
301  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
302 
303  const DestinationBuffer& destination() const { return m_destination; }
304 
305  template <int Layout, typename Scalar>
306  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
307  eigen_assert(dst_base != NULL);
308  m_destination =
309  DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
310  }
311 
312  template <int Layout, typename Scalar, typename DstStridesIndexType>
313  void AddDestinationBuffer(
314  Scalar* dst_base,
315  const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
316  // DSizes constructor will do index type promotion if it's safe.
317  AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
318  }
319 
320  TensorBlockDescriptor& DropDestinationBuffer() {
321  m_destination.m_data = NULL;
322  m_destination.m_kind = DestinationBuffer::kEmpty;
323  return *this;
324  }
325 
326  bool HasDestinationBuffer() const {
327  return m_destination.kind() != DestinationBuffer::kEmpty;
328  }
329 
330  // Returns a copy of `*this` with updated offset.
331  TensorBlockDescriptor WithOffset(IndexType offset) const {
332  return TensorBlockDescriptor(offset, m_dimensions, m_destination);
333  }
334 
335  private:
336  // Offset and dimensions are immutable after construction. Block descriptor
337  // can only be mutated by adding or dropping destination.
338  const IndexType m_offset;
339  const Dimensions m_dimensions;
340  DestinationBuffer m_destination;
341 };
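//
// Usage sketch (the pointer `dst_ptr` and the stride values are hypothetical):
// describe a 10x10 block starting at linear offset 100 of a larger 2-D tensor,
// and attach a destination buffer so the block can be materialized directly
// into the output memory.
//
//   typedef TensorBlockDescriptor<2> BlockDesc;
//   BlockDesc desc(/*offset=*/100, BlockDesc::Dimensions(10, 10));
//   desc.AddDestinationBuffer<ColMajor>(dst_ptr, BlockDesc::Dimensions(1, 100));
//   eigen_assert(desc.HasDestinationBuffer());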
342 
343 // -------------------------------------------------------------------------- //
344 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.
345 
346 template <int NumDims, int Layout, typename IndexType = Eigen::Index>
347 class TensorBlockMapper {
348  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
349 
350  public:
351  typedef DSizes<IndexType, NumDims> Dimensions;
352 
353  TensorBlockMapper() = default;
354  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
355  const TensorBlockResourceRequirements& requirements)
356  : m_tensor_dimensions(dimensions), m_requirements(requirements) {
357  // Compute block dimensions and the total number of blocks.
358  InitializeBlockDimensions();
359  }
360 
361  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
362  return m_total_block_count;
363  }
364 
365  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
366  return m_block_dimensions.TotalSize();
367  }
368 
369  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
370  blockDimensions() const {
371  return m_block_dimensions;
372  }
373 
374  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
375  blockDescriptor(IndexType block_index) const {
376  static const bool isColMajor = Layout == static_cast<int>(ColMajor);
377 
378  IndexType offset = 0;
379  DSizes<IndexType, NumDims> dimensions;
380 
381  if (NumDims == 0) return BlockDescriptor(offset, dimensions);
382 
383  // Iterate outer -> inner dimensions.
384  for (int i = NumDims - 1; i >= 0; --i) {
385  const int dim = isColMajor ? i : NumDims - i - 1;
386 
387  const IndexType idx = block_index / m_block_strides[dim];
388  block_index -= idx * m_block_strides[dim];
389 
390  const IndexType coord = idx * m_block_dimensions[dim];
391  dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
392  m_block_dimensions[dim]);
393  offset += coord * m_tensor_strides[dim];
394  }
395 
396  return {offset, dimensions};
397  }
398 
399  private:
400  void InitializeBlockDimensions() {
401  // Requested block shape and size.
402  const TensorBlockShapeType shape_type = m_requirements.shape_type;
403  IndexType target_block_size =
404  numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
405 
406  IndexType tensor_size = m_tensor_dimensions.TotalSize();
407 
408  // Corner case: one of the dimensions is zero. The logic below is too complex
409  // to handle this case in general, so just use a unit block size.
410  // Note: we must not yield blocks with zero dimensions (recipe for
411  // overflows/underflows, divisions by zero and NaNs later).
412  if (tensor_size == 0) {
413  for (int i = 0; i < NumDims; ++i) {
414  m_block_dimensions[i] = 1;
415  }
416  m_total_block_count = 0;
417  return;
418  }
419 
420  // If tensor fits into a target block size, evaluate it as a single block.
421  if (tensor_size <= target_block_size) {
422  m_block_dimensions = m_tensor_dimensions;
423  m_total_block_count = 1;
424  // The only valid block index is `0`, and in this case we do not need
425  // to compute real strides for tensor or blocks (see blockDescriptor).
426  for (int i = 0; i < NumDims; ++i) {
427  m_tensor_strides[i] = 0;
428  m_block_strides[i] = 1;
429  }
430  return;
431  }
432 
433  static const bool isColMajor = Layout == static_cast<int>(ColMajor);
434 
435  // Block shape skewed towards inner dimension.
436  if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
437  IndexType coeff_to_allocate = target_block_size;
438 
439  for (int i = 0; i < NumDims; ++i) {
440  const int dim = isColMajor ? i : NumDims - i - 1;
441  m_block_dimensions[dim] =
442  numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
443  coeff_to_allocate = divup(
444  coeff_to_allocate,
445  numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
446  }
447  eigen_assert(coeff_to_allocate == 1);
448 
449  } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
450  // Tensor will not fit within 'target_block_size' budget: calculate tensor
451  // block dimension sizes based on "square" dimension size target.
452  const IndexType dim_size_target = convert_index<IndexType>(
453  std::pow(static_cast<float>(target_block_size),
454  1.0f / static_cast<float>(m_block_dimensions.rank())));
455 
456  for (int i = 0; i < NumDims; ++i) {
457  // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
458  // a multiple of the packet size. Note that reducing
459  // 'block_dim_size' in this manner can increase the number of
460  // blocks, and so will amplify any per-block overhead.
461  m_block_dimensions[i] =
462  numext::mini(dim_size_target, m_tensor_dimensions[i]);
463  }
464 
465  // Add any un-allocated coefficients to inner dimension(s).
466  IndexType total_size = m_block_dimensions.TotalSize();
467  for (int i = 0; i < NumDims; ++i) {
468  const int dim = isColMajor ? i : NumDims - i - 1;
469 
470  if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
471  const IndexType total_size_other_dims =
472  total_size / m_block_dimensions[dim];
473  const IndexType alloc_avail =
474  divup<IndexType>(target_block_size, total_size_other_dims);
475  if (alloc_avail == m_block_dimensions[dim]) {
476  // Insufficient excess coefficients to allocate.
477  break;
478  }
479  m_block_dimensions[dim] =
480  numext::mini(m_tensor_dimensions[dim], alloc_avail);
481  total_size = total_size_other_dims * m_block_dimensions[dim];
482  }
483  }
484 
485  } else {
486  eigen_assert(false); // unknown block shape
487  }
488 
489  eigen_assert(m_block_dimensions.TotalSize() >=
490  numext::mini<IndexType>(target_block_size,
491  m_tensor_dimensions.TotalSize()));
492 
493  // Calculate block counts by dimension and total block count.
494  DSizes<IndexType, NumDims> block_count;
495  for (int i = 0; i < NumDims; ++i) {
496  block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
497  }
498  m_total_block_count = array_prod(block_count);
499 
500  // Calculate block strides (used for enumerating blocks).
501  m_tensor_strides = strides<Layout>(m_tensor_dimensions);
502  m_block_strides = strides<Layout>(block_count);
503  }
504 
505  DSizes<IndexType, NumDims> m_tensor_dimensions;
506  TensorBlockResourceRequirements m_requirements;
507 
508  DSizes<IndexType, NumDims> m_block_dimensions;
509  IndexType m_total_block_count;
510 
511  DSizes<IndexType, NumDims> m_tensor_strides;
512  DSizes<IndexType, NumDims> m_block_strides;
513 };
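//
// Usage sketch (hypothetical sizes): split a 100x100 float tensor into roughly
// square blocks of at most ~4KB each, and enumerate their descriptors.
//
//   TensorBlockMapper<2, ColMajor> mapper(
//       DSizes<Index, 2>(100, 100),
//       TensorBlockResourceRequirements::uniform<float>(4 * 1024));
//   for (Index i = 0; i < mapper.blockCount(); ++i) {
//     auto desc = mapper.blockDescriptor(i);  // offset + dimensions of block `i`
//   }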
514 
515 // -------------------------------------------------------------------------- //
516 // TensorBlockScratchAllocator is responsible for allocating temporary buffers
517 // for block evaluation (output or input block materialization). Given that
518 // Eigen expression traversal order is deterministic, all temporary allocations
519 // happen in the same order, and usually have exactly the same size. The
520 // scratch allocator keeps track of all dynamic allocations, and after the
521 // first block evaluation is completed, we should be able to reuse all the
522 // temporary buffers for the next block evaluation.
523 
524 template <typename Device>
525 class TensorBlockScratchAllocator {
526  public:
527  explicit TensorBlockScratchAllocator(const Device& device)
528  : m_device(device), m_allocation_index(0) {}
529 
530  ~TensorBlockScratchAllocator() {
531  for (size_t i = 0; i < m_allocations.size(); ++i) {
532  m_device.deallocate(m_allocations[i].ptr);
533  }
534  }
535 
536  void* allocate(size_t size) {
537  // TODO(ezhulenev): Remove when replaced with inlined vector.
538  if (m_allocations.capacity() == 0) m_allocations.reserve(8);
539 
540  // Check if we already have an existing allocation at the current index.
541  const int num_allocations = static_cast<int>(m_allocations.size());
542  const bool has_allocation = m_allocation_index < num_allocations;
543 
544  // Allocation index can't be larger than the number of allocations.
545  eigen_assert(m_allocation_index <= num_allocations);
546 
547  // If we have an existing allocation, and its size is larger than or equal
548  // to the requested size, we do nothing.
549 
550  // If current allocation can't fit requested size, we deallocate it, and
551  // replace with a larger allocation.
552  if (has_allocation && m_allocations[m_allocation_index].size < size) {
553  m_device.deallocate(m_allocations[m_allocation_index].ptr);
554  m_allocations[m_allocation_index].ptr = m_device.allocate(size);
555  m_allocations[m_allocation_index].size = size;
556  }
557 
558  // Make a new allocation if we don't have an existing one.
559  if (!has_allocation) {
560  Allocation allocation;
561  allocation.ptr = m_device.allocate(size);
562  allocation.size = size;
563  m_allocations.push_back(allocation);
564  }
565 
566  eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
567  eigen_assert(m_allocations[m_allocation_index].size >= size);
568 
569  return m_allocations[m_allocation_index++].ptr;
570  }
571 
572  void reset() { m_allocation_index = 0; }
573 
574  private:
575  struct Allocation {
576  void* ptr;
577  size_t size;
578  };
579 
580  const Device& m_device;
581  int m_allocation_index;
582  // TODO(ezhulenev): This should be an inlined vector.
583  std::vector<Allocation> m_allocations;
584 };
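//
// Usage sketch (DefaultDevice is used just for illustration): allocations made
// while evaluating the first block are reused for subsequent blocks after
// `reset()`.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* buf = scratch.allocate(1024);    // first block: real device allocation
//   scratch.reset();
//   void* again = scratch.allocate(1024);  // next block: reuses the same buffer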
585 
586 // -------------------------------------------------------------------------- //
587 // TensorBlockKind represents all possible block kinds that can be produced by
588 // the TensorEvaluator::evalBlock function.
589 enum TensorBlockKind {
590  // Tensor block that is a lazy expression that must be assigned to a
591  // destination using TensorBlockAssign.
592  kExpr,
593 
594  // Tensor block that is a view into a memory buffer owned by an underlying
595  // Tensor expression (e.g. it can be a view into a Tensor buffer).
596  kView,
597 
598  // Tensor block that was materialized in a scratch memory buffer, allocated
599  // with TensorBlockScratchAllocator. This block must be copied to a
600  // destination, similar to a block of `kExpr` type.
601  kMaterializedInScratch,
602 
603  // Tensor block that was materialized directly into the final output memory
604  // buffer. For example if the left side of an assignment is a Tensor, we can
605  // directly materialize the block in the destination memory.
606  //
607  // If strides in the output buffer do not match tensor block strides, the
608  // Tensor expression will be invalid, and should not be used by
609  // TensorBlockAssign or for constructing another block expression.
610  kMaterializedInOutput
611 };
612 
613 // -------------------------------------------------------------------------- //
614 // TensorBlockNotImplemented should be used to define the TensorBlock typedef in
615 // TensorEvaluators that do not support block evaluation.
616 
617 class TensorBlockNotImplemented {
618  public:
619  typedef void XprType;
620 };
621 
622 // -------------------------------------------------------------------------- //
623 // XprScalar extracts the Scalar type from an Eigen expression (if the expression
624 // type is not void). It's required to be able to define a lazy block expression
625 // for argument types that do not support block evaluation.
626 
627 template <typename XprType>
628 struct XprScalar {
629  typedef typename XprType::Scalar type;
630 };
631 template <>
632 struct XprScalar<void> {
633  typedef void type;
634 };
635 
636 // -------------------------------------------------------------------------- //
637 // TensorMaterializedBlock is a fully evaluated block of the original tensor,
638 // and XprType is just a TensorMap over the data. This block type is typically
639 // used to materialize blocks of tensor expressions that can't be efficiently
640 // represented as lazy Tensor expressions with fast coeff/packet operations,
641 // e.g. we materialize all broadcasts into evaluated blocks.
642 //
643 // TensorMaterializedBlock does not own its memory buffer: it's either a memory
644 // buffer that backs the original expression (e.g. the block is just a view into
645 // a Tensor), or a memory buffer allocated with the scratch allocator, in which
646 // case the scratch allocator will deallocate it at the end of block-based
647 // expression execution.
648 //
649 // If the block was evaluated directly into the output buffer, and strides in
650 // the output buffer do not match block strides, the TensorMap expression will
651 // be invalid, and should never be used in block assignment or any other tensor
652 // expression.
653 
654 template <typename Scalar, int NumDims, int Layout,
655  typename IndexType = Eigen::Index>
656 class TensorMaterializedBlock {
657  public:
658  typedef DSizes<IndexType, NumDims> Dimensions;
659  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
660 
661  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
662  const Dimensions& dimensions, bool valid_expr = true)
663  : m_kind(kind),
664  m_data(data),
665  m_dimensions(dimensions),
666  m_expr(m_data, m_dimensions),
667  m_valid_expr(valid_expr) {
668  eigen_assert(m_kind == internal::TensorBlockKind::kView ||
669  m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
670  m_kind == internal::TensorBlockKind::kMaterializedInOutput);
671  }
672 
673  TensorBlockKind kind() const { return m_kind; }
674  // NOTE(ezhulenev): Returning XprType by value like in other block types
675  // causes asan failures. The theory is that XprType::Nested doesn't work
676  // properly for TensorMap.
677  const XprType& expr() const {
678  eigen_assert(m_valid_expr);
679  return m_expr;
680  }
681  const Scalar* data() const { return m_data; }
682  void cleanup() {}
683 
684  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
685 
686  // TensorMaterializedBlock can be backed by different types of storage:
687  //
688  // (1) Contiguous block of memory allocated with scratch allocator.
689  // (2) Contiguous block of memory reused from tensor block descriptor
690  // destination buffer.
691  // (3) Strided block of memory reused from tensor block descriptor
692  // destination buffer.
693  //
694  class Storage {
695  public:
696  Scalar* data() const { return m_data; }
697  const Dimensions& dimensions() const { return m_dimensions; }
698  const Dimensions& strides() const { return m_strides; }
699 
700  TensorMaterializedBlock AsTensorMaterializedBlock() const {
701  return TensorMaterializedBlock(
702  m_materialized_in_output
703  ? internal::TensorBlockKind::kMaterializedInOutput
704  : internal::TensorBlockKind::kMaterializedInScratch,
705  m_data, m_dimensions, !m_strided_storage);
706  }
707 
708  private:
709  friend class TensorMaterializedBlock;
710 
711  Storage(Scalar* data, const Dimensions& dimensions,
712  const Dimensions& strides, bool materialized_in_output,
713  bool strided_storage)
714  : m_data(data),
715  m_dimensions(dimensions),
716  m_strides(strides),
717  m_materialized_in_output(materialized_in_output),
718  m_strided_storage(strided_storage) {}
719 
720  Scalar* m_data;
721  Dimensions m_dimensions;
722  Dimensions m_strides;
723  bool m_materialized_in_output;
724  bool m_strided_storage;
725  };
726 
727  // Creates storage for a materialized block, either reusing the block descriptor
728  // destination buffer or allocating a new buffer with the scratch allocator.
729  template <typename TensorBlockScratch>
730  EIGEN_STRONG_INLINE static Storage prepareStorage(
731  TensorBlockDesc& desc, TensorBlockScratch& scratch,
732  bool allow_strided_storage = false) {
733  // Try to reuse destination as an output block buffer.
734  typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
735 
736  if (desc.destination().kind() == DestinationBuffer::kContiguous) {
737  Scalar* buffer = desc.destination().template data<Scalar>();
738  desc.DropDestinationBuffer();
739  return Storage(buffer, desc.dimensions(),
740  internal::strides<Layout>(desc.dimensions()),
741  /*materialized_in_output=*/true,
742  /*strided_storage=*/false);
743 
744  } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
745  allow_strided_storage) {
746  Scalar* buffer = desc.destination().template data<Scalar>();
747  desc.DropDestinationBuffer();
748  return Storage(buffer, desc.dimensions(), desc.destination().strides(),
749  /*materialized_in_output=*/true, /*strided_storage=*/true);
750 
751  } else {
752  void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
753  return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
754  internal::strides<Layout>(desc.dimensions()),
755  /*materialized_in_output=*/false,
756  /*strided_storage=*/false);
757  }
758  }
759 
760  // Creates a materialized block for the given descriptor from a memory buffer.
761  template <typename DataDimensions, typename TensorBlockScratch>
762  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
763  const Scalar* data, const DataDimensions& data_dims,
764  TensorBlockDesc& desc, TensorBlockScratch& scratch) {
765  eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
766 
767  // If the tensor block dimensions cover a contiguous block of the underlying
768  // memory, we can skip block buffer memory allocation, and construct a block
769  // from the existing `data` memory buffer.
770  //
771  // Example: (RowMajor layout)
772  // data_dims: [11, 12, 13, 14]
773  // desc.dimensions(): [1, 1, 3, 14]
774  //
775  // In this case we can construct a TensorBlock starting at
776  // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
777  static const bool is_col_major = Layout == ColMajor;
778 
779  // Find out how many inner dimensions have a matching size.
780  int num_matching_inner_dims = 0;
781  for (int i = 0; i < NumDims; ++i) {
782  int dim = is_col_major ? i : NumDims - i - 1;
783  if (data_dims[dim] != desc.dimensions()[dim]) break;
784  ++num_matching_inner_dims;
785  }
786 
787  // All the outer dimensions must be of size `1`, except a single dimension
788  // before the matching inner dimension (`3` in the example above).
789  bool can_use_direct_access = true;
790  for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
791  int dim = is_col_major ? i : NumDims - i - 1;
792  if (desc.dimension(dim) != 1) {
793  can_use_direct_access = false;
794  break;
795  }
796  }
797 
798  if (can_use_direct_access) {
799  const Scalar* block_start = data + desc.offset();
800  return TensorMaterializedBlock(internal::TensorBlockKind::kView,
801  block_start, desc.dimensions());
802 
803  } else {
804  // Reuse destination buffer or allocate new buffer with scratch allocator.
805  const Storage storage = prepareStorage(desc, scratch);
806 
807  typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
808  TensorBlockIO;
809  typedef typename TensorBlockIO::Dst TensorBlockIODst;
810  typedef typename TensorBlockIO::Src TensorBlockIOSrc;
811 
812  TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
813  data, desc.offset());
814  TensorBlockIODst dst(storage.dimensions(), storage.strides(),
815  storage.data());
816 
817  TensorBlockIO::Copy(dst, src);
818  return storage.AsTensorMaterializedBlock();
819  }
820  }
821 
822  private:
823  TensorBlockKind m_kind;
824  const Scalar* m_data;
825  Dimensions m_dimensions;
826  XprType m_expr;
827  bool m_valid_expr;
828 };
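//
// Usage sketch (`data`, `data_dims`, `desc` and `scratch` are assumed to exist):
// materialize the block described by `desc` from a dense row-major buffer; the
// result is either a `kView` into `data` or a copy made via TensorBlockIO into
// scratch/destination memory.
//
//   typedef TensorMaterializedBlock<float, 4, RowMajor> MaterializedBlock;
//   MaterializedBlock block =
//       MaterializedBlock::materialize(data, data_dims, desc, scratch);
//   auto expr = block.expr();  // TensorMap over the materialized coefficients
//   block.cleanup();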
829 
830 // -------------------------------------------------------------------------- //
831 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies a UnaryOp
832 // functor to the blocks produced by the underlying Tensor expression.
833 
834 template <typename UnaryOp, typename ArgTensorBlock>
835 class TensorCwiseUnaryBlock {
836  static const bool NoArgBlockAccess =
837  internal::is_void<typename ArgTensorBlock::XprType>::value;
838 
839  public:
840  typedef typename conditional<
841  NoArgBlockAccess, void,
842  TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
843  type XprType;
844 
845  typedef typename XprScalar<XprType>::type Scalar;
846 
847  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
848  : m_arg_block(arg_block), m_functor(functor) {}
849 
850  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
851 
852  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
853  const Scalar* data() const { return NULL; }
854  void cleanup() { m_arg_block.cleanup(); }
855 
856  private:
857  ArgTensorBlock m_arg_block;
858  UnaryOp m_functor;
859 };
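//
// Usage sketch (the `ArgBlock` type and `arg_block` value are hypothetical):
// wrap an argument block into a lazy abs() block; nothing is computed until the
// resulting expression is assigned (e.g. through TensorBlockAssignment below).
//
//   typedef TensorCwiseUnaryBlock<scalar_abs_op<float>, ArgBlock> UnaryBlock;
//   UnaryBlock block(arg_block, scalar_abs_op<float>());
//   auto expr = block.expr();  // TensorCwiseUnaryOp over arg_block.expr()
//   block.cleanup();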
860 
861 // -------------------------------------------------------------------------- //
862 // TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
863 // BinaryOp functor to the blocks produced by the two underlying Tensor expressions.
864 
865 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
866 class TensorCwiseBinaryBlock {
867  static const bool NoArgBlockAccess =
868  internal::is_void<typename LhsTensorBlock::XprType>::value ||
869  internal::is_void<typename RhsTensorBlock::XprType>::value;
870 
871  public:
872  typedef typename conditional<
873  NoArgBlockAccess, void,
874  TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
875  const typename RhsTensorBlock::XprType> >::type
876  XprType;
877 
878  typedef typename XprScalar<XprType>::type Scalar;
879 
880  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
881  const RhsTensorBlock& right_block,
882  const BinaryOp& functor)
883  : m_left_block(left_block),
884  m_right_block(right_block),
885  m_functor(functor) {}
886 
887  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
888 
889  XprType expr() const {
890  return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
891  }
892 
893  const Scalar* data() const { return NULL; }
894 
895  void cleanup() {
896  m_left_block.cleanup();
897  m_right_block.cleanup();
898  }
899 
900  private:
901  LhsTensorBlock m_left_block;
902  RhsTensorBlock m_right_block;
903  BinaryOp m_functor;
904 };
905 
906 // -------------------------------------------------------------------------- //
907 // TensorUnaryExprBlock is a lazy tensor expression block that can construct
908 // an arbitrary tensor expression from a block of the underlying type (this is a
909 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
910 
911 template <typename BlockFactory, typename ArgTensorBlock>
912 class TensorUnaryExprBlock {
913  typedef typename ArgTensorBlock::XprType ArgXprType;
914  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
915 
916  public:
917  typedef typename conditional<
918  NoArgBlockAccess, void,
919  typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
920 
921  typedef typename XprScalar<XprType>::type Scalar;
922 
923  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
924  const BlockFactory& factory)
925  : m_arg_block(arg_block), m_factory(factory) {}
926 
927  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
928  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
929  const Scalar* data() const { return NULL; }
930  void cleanup() { m_arg_block.cleanup(); }
931 
932  private:
933  ArgTensorBlock m_arg_block;
934  BlockFactory m_factory;
935 };
936 
937 // -------------------------------------------------------------------------- //
938 // TensorTernaryExprBlock is a lazy tensor expression block that can construct
939 // an arbitrary tensor expression from three blocks of the underlying type.
940 
941 template <typename BlockFactory, typename Arg1TensorBlock,
942  typename Arg2TensorBlock, typename Arg3TensorBlock>
943 class TensorTernaryExprBlock {
944  typedef typename Arg1TensorBlock::XprType Arg1XprType;
945  typedef typename Arg2TensorBlock::XprType Arg2XprType;
946  typedef typename Arg3TensorBlock::XprType Arg3XprType;
947 
948  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
949  internal::is_void<Arg2XprType>::value ||
950  internal::is_void<Arg3XprType>::value;
951 
952  public:
953  typedef typename conditional<
954  NoArgBlockAccess, void,
955  typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
956  Arg3XprType>::type>::type XprType;
957 
958  typedef typename XprScalar<XprType>::type Scalar;
959 
960  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
961  const Arg2TensorBlock& arg2_block,
962  const Arg3TensorBlock& arg3_block,
963  const BlockFactory& factory)
964  : m_arg1_block(arg1_block),
965  m_arg2_block(arg2_block),
966  m_arg3_block(arg3_block),
967  m_factory(factory) {}
968 
969  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
970  XprType expr() const {
971  return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
972  m_arg3_block.expr());
973  }
974  const Scalar* data() const { return NULL; }
975  void cleanup() {
976  m_arg1_block.cleanup();
977  m_arg2_block.cleanup();
978  m_arg3_block.cleanup();
979  }
980 
981  private:
982  Arg1TensorBlock m_arg1_block;
983  Arg2TensorBlock m_arg2_block;
984  Arg3TensorBlock m_arg3_block;
985  BlockFactory m_factory;
986 };
987 
988 // -------------------------------------------------------------------------- //
989 // StridedLinearBufferCopy provides a method to copy data between two linear
990 // buffers with different strides, with optimized paths for scatter/gather.
991 
992 template <typename Scalar, typename IndexType>
993 class StridedLinearBufferCopy {
994  typedef typename packet_traits<Scalar>::type Packet;
995  enum {
996  Vectorizable = packet_traits<Scalar>::Vectorizable,
997  PacketSize = packet_traits<Scalar>::size
998  };
999 
1000  public:
1001  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
1002  enum class Kind {
1003  Linear = 0, // src_stride == 1 && dst_stride == 1
1004  Scatter = 1, // src_stride == 1 && dst_stride != 1
1005  FillLinear = 2, // src_stride == 0 && dst_stride == 1
1006  FillScatter = 3, // src_stride == 0 && dst_stride != 1
1007  Gather = 4, // dst_stride == 1
1008  Random = 5 // everything else
1009  };
1010 
1011  struct Dst {
1012  Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
1013 
1014  IndexType offset;
1015  IndexType stride;
1016  Scalar* data;
1017  };
1018 
1019  struct Src {
1020  Src(IndexType o, IndexType s, const Scalar* d)
1021  : offset(o), stride(s), data(d) {}
1022 
1023  IndexType offset;
1024  IndexType stride;
1025  const Scalar* data;
1026  };
1027 
1028  template <typename StridedLinearBufferCopy::Kind kind>
1029  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
1030  const Src& src,
1031  const size_t count) {
1032  Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
1033  src.data);
1034  }
1035 
1036  private:
1037  template <typename StridedLinearBufferCopy::Kind kind>
1038  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1039  const IndexType count, const IndexType dst_offset,
1040  const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
1041  const IndexType src_offset, const IndexType src_stride,
1042  const Scalar* EIGEN_RESTRICT src_data) {
1043  const Scalar* src = &src_data[src_offset];
1044  Scalar* dst = &dst_data[dst_offset];
1045 
1046  if (!Vectorizable) {
1047  for (Index i = 0; i < count; ++i) {
1048  dst[i * dst_stride] = src[i * src_stride];
1049  }
1050  return;
1051  }
1052 
1053  const IndexType vectorized_size = count - PacketSize;
1054  IndexType i = 0;
1055 
1056  if (kind == StridedLinearBufferCopy::Kind::Linear) {
1057  // ******************************************************************** //
1058  // Linear copy from `src` to `dst`.
1059  const IndexType unrolled_size = count - 4 * PacketSize;
1060  eigen_assert(src_stride == 1 && dst_stride == 1);
1061  for (; i <= unrolled_size; i += 4 * PacketSize) {
1062  for (int j = 0; j < 4; ++j) {
1063  Packet p = ploadu<Packet>(src + i + j * PacketSize);
1064  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1065  }
1066  }
1067  for (; i <= vectorized_size; i += PacketSize) {
1068  Packet p = ploadu<Packet>(src + i);
1069  pstoreu<Scalar, Packet>(dst + i, p);
1070  }
1071  for (; i < count; ++i) {
1072  dst[i] = src[i];
1073  }
1074  // ******************************************************************** //
1075  } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
1076  // Scatter from `src` to `dst`.
1077  eigen_assert(src_stride == 1 && dst_stride != 1);
1078  for (; i <= vectorized_size; i += PacketSize) {
1079  Packet p = ploadu<Packet>(src + i);
1080  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1081  }
1082  for (; i < count; ++i) {
1083  dst[i * dst_stride] = src[i];
1084  }
1085  // ******************************************************************** //
1086  } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1087  // Fill `dst` with value at `*src`.
1088  eigen_assert(src_stride == 0 && dst_stride == 1);
1089  const IndexType unrolled_size = count - 4 * PacketSize;
1090  Packet p = pload1<Packet>(src);
1091  for (; i <= unrolled_size; i += 4 * PacketSize) {
1092  for (int j = 0; j < 4; ++j) {
1093  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1094  }
1095  }
1096  for (; i <= vectorized_size; i += PacketSize) {
1097  pstoreu<Scalar, Packet>(dst + i, p);
1098  }
1099  for (; i < count; ++i) {
1100  dst[i] = *src;
1101  }
1102  // ******************************************************************** //
1103  } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1104  // Scatter `*src` into `dst`.
1105  eigen_assert(src_stride == 0 && dst_stride != 1);
1106  Packet p = pload1<Packet>(src);
1107  for (; i <= vectorized_size; i += PacketSize) {
1108  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1109  }
1110  for (; i < count; ++i) {
1111  dst[i * dst_stride] = *src;
1112  }
1113  // ******************************************************************** //
1114  } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1115  // Gather from `src` into `dst`.
1116  eigen_assert(dst_stride == 1);
1117  for (; i <= vectorized_size; i += PacketSize) {
1118  Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1119  pstoreu<Scalar, Packet>(dst + i, p);
1120  }
1121  for (; i < count; ++i) {
1122  dst[i] = src[i * src_stride];
1123  }
1124  // ******************************************************************** //
1125  } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1126  // Random.
1127  for (; i < count; ++i) {
1128  dst[i * dst_stride] = src[i * src_stride];
1129  }
1130  } else {
1131  eigen_assert(false);
1132  }
1133  }
1134 };
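//
// Usage sketch (`dst_ptr`, `src_ptr` and `n` are hypothetical): copy `n` floats
// from a contiguous source buffer into a destination buffer with stride 3,
// selecting the Scatter specialization at compile time.
//
//   typedef StridedLinearBufferCopy<float, Index> LinCopy;
//   LinCopy::Run<LinCopy::Kind::Scatter>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/3, dst_ptr),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src_ptr),
//       /*count=*/n);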
1135 
1136 // -------------------------------------------------------------------------- //
1137 // TensorBlockIO copies data from the `src` tensor block to the `dst` tensor block.
1138 // It's possible to specify a src->dst dimension mapping for the copy operation.
1139 // The dimensions of `dst` specify how many elements have to be copied; for
1140 // `src` we only need to know the strides to navigate through the source memory buffer.
1141 
1142 template <typename Scalar, typename IndexType, int NumDims, int Layout>
1143 class TensorBlockIO {
1144  static const bool IsColMajor = (Layout == ColMajor);
1145 
1146  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1147 
1148  public:
1149  typedef DSizes<IndexType, NumDims> Dimensions;
1150  typedef DSizes<int, NumDims> DimensionsMap;
1151 
1152  struct Dst {
1153  Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
1154  IndexType dst_offset = 0)
1155  : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1156 
1157  Dimensions dims;
1158  Dimensions strides;
1159  Scalar* data;
1160  IndexType offset;
1161  };
1162 
1163  struct Src {
1164  Src(const Dimensions& src_strides, const Scalar* src,
1165  IndexType src_offset = 0)
1166  : strides(src_strides), data(src), offset(src_offset) {}
1167 
1168  Dimensions strides;
1169  const Scalar* data;
1170  IndexType offset;
1171  };
1172 
1173  // Copies data to `dst` from `src`, using provided dimensions mapping:
1174  //
1175  // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1176  //
1177  // Returns the number of copied elements.
1178  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
1179  const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
1180  // Copy single scalar value from `src` to `dst`.
1181  if (NumDims == 0) {
1182  *(dst.data + dst.offset) = *(src.data + src.offset);
1183  return 1;
1184  }
1185 
1186  // Both `dst` and `src` must have a contiguous innermost dimension. We also
1187  // accept the special case of stride '0', because it's used as a trick to
1188  // implement broadcasting.
1189  {
1190  int inner_dim = IsColMajor ? 0 : NumDims - 1;
1191  EIGEN_UNUSED_VARIABLE(inner_dim);
1192  eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1193  eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1194  }
1195 
1196  // Give a shorter name to `dst_to_src_dim_map`.
1197  const DimensionsMap& dim_map = dst_to_src_dim_map;
1198 
1199  // Do not squeeze reordered inner dimensions.
1200  int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1201 
1202  // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1203  // block, and we write data linearly into that dimension, reading it from
1204  // the src. If dimensions are reordered, we might end up reading data from
1205  // the src with `stride != 1`.
1206  //
1207  // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1208  // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1209 
1210  // Find the innermost dimension in the dst whose size is not 1. This is the
1211  // effective inner dim.
1212  int num_size_one_inner_dims = 0;
1213  for (int i = 0; i < num_squeezable_dims; ++i) {
1214  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1215  if (dst.dims[dst_dim] != 1) break;
1216  num_size_one_inner_dims++;
1217  }
1218 
1219  // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1220  if (num_size_one_inner_dims == NumDims) {
1221  *(dst.data + dst.offset) = *(src.data + src.offset);
1222  return 1;
1223  }
1224 
1225  // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1226  const int dst_stride1_dim = IsColMajor
1227  ? num_size_one_inner_dims
1228  : NumDims - num_size_one_inner_dims - 1;
1229 
1230  // Dimension in the src that corresponds to the dst innermost dimension.
1231  const int src_dim_for_dst_stride1_dim =
1232  NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1233 
1234  // Size of the innermost dimension (length of contiguous blocks of memory).
1235  IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1236 
1237  // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1238  // `src` memory, so we can do fewer linear copy calls.
1239  for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1240  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1241  const IndexType dst_stride = dst.strides[dst_dim];
1242  const IndexType src_stride = src.strides[dim_map[dst_dim]];
1243  if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1244  dst_inner_dim_size *= dst.dims[dst_dim];
1245  ++num_size_one_inner_dims;
1246  } else {
1247  break;
1248  }
1249  }
1250 
1251  // Setup strides to read data from `src` and write to `dst`.
1252  IndexType input_offset = src.offset;
1253  IndexType output_offset = dst.offset;
1254  IndexType input_stride =
1255  NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1256  IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1257 
1258  const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1259  array<BlockIteratorState, at_least_1_dim> it;
1260 
1261  // Initialize block iterator state. Squeeze away any dimension of size 1.
1262  int idx = 0; // currently initialized iterator state index
1263  for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1264  const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1265  if (dst.dims[dst_dim] == 1) continue;
1266 
1267  it[idx].size = dst.dims[dst_dim];
1268  it[idx].input_stride = src.strides[dim_map[dst_dim]];
1269  it[idx].output_stride = dst.strides[dst_dim];
1270 
1271  it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1272  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1273 
1274  idx++;
1275  }
1276 
1277  // Iterate copying data from src to dst.
1278  const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1279 
1280 #define COPY_INNER_DIM(KIND) \
1281  IndexType num_copied = 0; \
1282  for (num_copied = 0; num_copied < block_total_size; \
1283  num_copied += dst_inner_dim_size) { \
1284  LinCopy::template Run<KIND>( \
1285  typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1286  typename LinCopy::Src(input_offset, input_stride, src.data), \
1287  dst_inner_dim_size); \
1288  \
1289  for (int j = 0; j < idx; ++j) { \
1290  if (++it[j].count < it[j].size) { \
1291  input_offset += it[j].input_stride; \
1292  output_offset += it[j].output_stride; \
1293  break; \
1294  } \
1295  it[j].count = 0; \
1296  input_offset -= it[j].input_span; \
1297  output_offset -= it[j].output_span; \
1298  } \
1299  } \
1300  return num_copied;
1301 
1302  if (input_stride == 1 && output_stride == 1) {
1303  COPY_INNER_DIM(LinCopy::Kind::Linear);
1304  } else if (input_stride == 1 && output_stride != 1) {
1305  COPY_INNER_DIM(LinCopy::Kind::Scatter);
1306  } else if (input_stride == 0 && output_stride == 1) {
1307  COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1308  } else if (input_stride == 0 && output_stride != 1) {
1309  COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1310  } else if (output_stride == 1) {
1311  COPY_INNER_DIM(LinCopy::Kind::Gather);
1312  } else {
1313  COPY_INNER_DIM(LinCopy::Kind::Random);
1314  }
1315 
1316 #undef COPY_INNER_DIM
1317  }
1318 
1319  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1320  // the number of copied elements.
1321  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
1322  const Src& src) {
1323  DimensionsMap dst_to_src_map;
1324  for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1325  return Copy(dst, src, dst_to_src_map);
1326  }
1327 
1328  private:
1329  struct BlockIteratorState {
1330  BlockIteratorState()
1331  : size(0),
1332  count(0),
1333  input_stride(0),
1334  output_stride(0),
1335  input_span(0),
1336  output_span(0) {}
1337 
1338  IndexType size;
1339  IndexType count;
1340  IndexType input_stride;
1341  IndexType output_stride;
1342  IndexType input_span;
1343  IndexType output_span;
1344  };
1345 
1346  // Compute how many inner dimensions we are allowed to squeeze when doing IO
1347  // between two tensor blocks. It's safe to squeeze inner dimensions only
1348  // if they are not reordered.
1349  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1350  int num_squeezable_dims = 0;
1351  for (int i = 0; i < NumDims; ++i) {
1352  const int dim = IsColMajor ? i : NumDims - i - 1;
1353  if (dim_map[dim] != dim) break;
1354  num_squeezable_dims++;
1355  }
1356  return num_squeezable_dims;
1357  }
1358 };
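//
// Usage sketch (`dst_ptr` and `src_ptr` are hypothetical): copy a 2x3 block
// between two column-major buffers with different strides, using the identity
// dst->src dimension map.
//
//   typedef TensorBlockIO<float, Index, 2, ColMajor> BlockIO;
//   BlockIO::Dst dst(/*dst_dims=*/BlockIO::Dimensions(2, 3),
//                    /*dst_strides=*/BlockIO::Dimensions(1, 2), dst_ptr);
//   BlockIO::Src src(/*src_strides=*/BlockIO::Dimensions(1, 8), src_ptr);
//   Index copied = BlockIO::Copy(dst, src);  // copies 2 * 3 = 6 elements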
1359 
1360 // -------------------------------------------------------------------------- //
1361 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1362 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1363 //
1364 // Currently there is no way to write from a Tensor expression to a block of
1365 // memory, if dimensions are reordered. If you need to do that, you should
1366 // materialize a Tensor block expression into a memory buffer, and then use
1367 // TensorBlockIO to copy data between two memory buffers with a custom
1368 // `target->src` dimension map (see definition above).
1369 //
1370 // Also currently the innermost dimension of `target` must have a stride '1'
1371 // (contiguous in memory). This restriction could be lifted with a `pscatter`,
1372 // but in practice it's never needed, and there is a similar TensorBlockIO
1373 // workaround for that.
1374 //
1375 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1376 // where `src` is a tensor expression. Explore if it is possible to rewrite IO
1377 // to use expressions instead of pointers, and after that TensorBlockAssignment
1378 // will become an alias to IO.
1379 template <typename Scalar, int NumDims, typename TensorBlockExpr,
1380  typename IndexType = Eigen::Index>
1381 class TensorBlockAssignment {
1382  // We will use coeff/packet path to evaluate block expressions.
1383  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
1384  TensorBlockEvaluator;
1385 
1386  typedef DSizes<IndexType, NumDims> Dimensions;
1387 
1388  enum {
1389  Vectorizable = packet_traits<Scalar>::Vectorizable,
1390  PacketSize = packet_traits<Scalar>::size
1391  };
1392 
1393  template <bool Vectorizable, typename Evaluator>
1394  struct InnerDimAssign {
1395  EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1396  const Evaluator& eval,
1397  IndexType eval_offset) {
1398  for (IndexType i = 0; i < count; ++i) {
1399  target[i] = eval.coeff(eval_offset + i);
1400  }
1401  }
1402  };
1403 
1404  template <typename Evaluator>
1405  struct InnerDimAssign<true, Evaluator> {
1406  EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1407  const Evaluator& eval,
1408  IndexType eval_offset) {
1409  typedef typename packet_traits<Scalar>::type Packet;
1410 
1411  const IndexType unrolled_size = count - 4 * PacketSize;
1412  const IndexType vectorized_size = count - PacketSize;
1413  IndexType i = 0;
1414 
1415  for (; i <= unrolled_size; i += 4 * PacketSize) {
1416  for (int j = 0; j < 4; ++j) {
1417  const IndexType idx = eval_offset + i + j * PacketSize;
1418  Packet p = eval.template packet<Unaligned>(idx);
1419  pstoreu<Scalar>(target + i + j * PacketSize, p);
1420  }
1421  }
1422 
1423  for (; i <= vectorized_size; i += PacketSize) {
1424  Packet p = eval.template packet<Unaligned>(eval_offset + i);
1425  pstoreu<Scalar>(target + i, p);
1426  }
1427 
1428  for (; i < count; ++i) {
1429  target[i] = eval.coeff(eval_offset + i);
1430  }
1431  }
1432  };
1433 
1434  public:
1435  struct Target {
1436  Target(const Dimensions& target_dims, const Dimensions& target_strides,
1437  Scalar* target_data, IndexType target_offset = 0)
1438  : dims(target_dims),
1439  strides(target_strides),
1440  data(target_data),
1441  offset(target_offset) {}
1442 
1443  Dimensions dims;
1444  Dimensions strides;
1445  Scalar* data;
1446  IndexType offset;
1447  };
1448 
1449  static Target target(const Dimensions& target_dims,
1450  const Dimensions& target_strides, Scalar* target_data,
1451  IndexType target_offset = 0) {
1452  return Target(target_dims, target_strides, target_data, target_offset);
1453  }
1454 
1455  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1456  static Target target(
1457  const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1458  const DSizes<TargetStridesIndexType, NumDims>& target_strides,
1459  Scalar* target_data, IndexType target_offset = 0) {
1460  // DSizes constructor will do index type promotion if it's safe.
1461  return Target(Dimensions(target_dims), Dimensions(target_strides),
1462  target_data, target_offset);
1463  }
1464 
1465  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1466  const Target& target, const TensorBlockExpr& expr) {
1467  // Prepare evaluator for block expression.
1468  DefaultDevice default_device;
1469  TensorBlockEvaluator eval(expr, default_device);
1470 
1471  // Tensor block expression dimension should match destination dimensions.
1472  eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1473 
1474  static const int Layout = TensorBlockEvaluator::Layout;
1475  static const bool is_col_major = Layout == ColMajor;
1476 
1477  // Initialize output inner dimension size based on a layout.
1478  const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1479  const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1480  IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1481 
1482  // Target inner dimension stride must be '1'.
1483  eigen_assert(target.strides[inner_dim_idx] == 1);
1484 
1485  // Squeeze multiple inner dims into one if they are contiguous in `target`.
1486  IndexType num_squeezed_dims = 0;
1487  for (Index i = 1; i < NumDims; ++i) {
1488  const Index dim = is_col_major ? i : NumDims - i - 1;
1489  const IndexType target_stride = target.strides[dim];
1490 
1491  if (output_inner_dim_size == target_stride) {
1492  output_inner_dim_size *= target.dims[dim];
1493  num_squeezed_dims++;
1494  } else {
1495  break;
1496  }
1497  }
1498 
1499  // Initialize output block iterator state. Dimensions in this array are
1500  // always in inner_most -> outer_most order (col major layout).
1501  array<BlockIteratorState, NumDims> it;
1502 
1503  int idx = 0; // currently initialized iterator state index
1504  for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1505  const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1506 
1507  it[idx].count = 0;
1508  it[idx].size = target.dims[dim];
1509  it[idx].output_stride = target.strides[dim];
1510  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1511  idx++;
1512  }
1513 
1514  // We read block expression from the beginning, and start writing data to
1515  // `target` at given offset.
1516  IndexType input_offset = 0;
1517  IndexType output_offset = target.offset;
1518 
1519  // Iterate copying data from `eval` to `target`.
1520  for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1521  // Assign to `target` at current offset.
1522  InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
1523  TensorBlockEvaluator>::Run(target.data + output_offset,
1524  output_inner_dim_size, eval,
1525  input_offset);
1526 
1527  // Move input offset forward by the number of assigned coefficients.
1528  input_offset += output_inner_dim_size;
1529 
1530  // Update index.
1531  for (int j = 0; j < idx; ++j) {
1532  if (++it[j].count < it[j].size) {
1533  output_offset += it[j].output_stride;
1534  break;
1535  }
1536  it[j].count = 0;
1537  output_offset -= it[j].output_span;
1538  }
1539  }
1540  }
1541 
1542  private:
1543  struct BlockIteratorState {
1544  BlockIteratorState()
1545  : count(0), size(0), output_stride(0), output_span(0) {}
1546 
1547  IndexType count;
1548  IndexType size;
1549  IndexType output_stride;
1550  IndexType output_span;
1551  };
1552 };
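//
// Usage sketch (`BlockExpr`, `expr` and `target_ptr` are hypothetical, and a
// column-major block expression is assumed so the inner stride is 1): evaluate
// a block expression into a contiguous 10x10 target buffer.
//
//   typedef TensorBlockAssignment<float, 2, BlockExpr> BlockAssign;
//   BlockAssign::Run(
//       BlockAssign::target(/*target_dims=*/DSizes<Index, 2>(10, 10),
//                           /*target_strides=*/DSizes<Index, 2>(1, 10),
//                           target_ptr),
//       expr);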
1553 
1554 // -------------------------------------------------------------------------- //
1555 
1556 } // namespace internal
1557 } // namespace Eigen
1558 
1559 #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H