TensorBroadcasting.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
12 
13 namespace Eigen {
14 
22 namespace internal {
23 template<typename Broadcast, typename XprType>
24 struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
25 {
26  typedef typename XprType::Scalar Scalar;
27  typedef traits<XprType> XprTraits;
28  typedef typename XprTraits::StorageKind StorageKind;
29  typedef typename XprTraits::Index Index;
30  typedef typename XprType::Nested Nested;
31  typedef typename remove_reference<Nested>::type _Nested;
32  static const int NumDimensions = XprTraits::NumDimensions;
33  static const int Layout = XprTraits::Layout;
34  typedef typename XprTraits::PointerType PointerType;
35 };
36 
37 template<typename Broadcast, typename XprType>
38 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
39 {
40  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
41 };
42 
43 template<typename Broadcast, typename XprType>
44 struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
45 {
46  typedef TensorBroadcastingOp<Broadcast, XprType> type;
47 };
48 
49 template <typename Dims>
50 struct is_input_scalar {
51  static const bool value = false;
52 };
53 template <>
54 struct is_input_scalar<Sizes<> > {
55  static const bool value = true;
56 };
57 #ifndef EIGEN_EMULATE_CXX11_META_H
58 template <typename std::ptrdiff_t... Indices>
59 struct is_input_scalar<Sizes<Indices...> > {
60  static const bool value = (Sizes<Indices...>::total_size == 1);
61 };
62 #endif
63 
64 } // end namespace internal
65 
66 
67 
68 template<typename Broadcast, typename XprType>
69 class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
70 {
71  public:
72  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
73  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
74  typedef typename XprType::CoeffReturnType CoeffReturnType;
75  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
76  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
77  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
78 
79  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
80  : m_xpr(expr), m_broadcast(broadcast) {}
81 
82  EIGEN_DEVICE_FUNC
83  const Broadcast& broadcast() const { return m_broadcast; }
84 
85  EIGEN_DEVICE_FUNC
86  const typename internal::remove_all<typename XprType::Nested>::type&
87  expression() const { return m_xpr; }
88 
89  protected:
90  typename XprType::Nested m_xpr;
91  const Broadcast m_broadcast;
92 };
93 
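// Usage sketch (illustrative, not part of this header): a TensorBroadcastingOp is
// normally created through TensorBase::broadcast(), with one replication factor per
// dimension; each output dimension is the input dimension times its factor.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   int main() {
//     Eigen::Tensor<float, 2> input(2, 3);
//     input.setConstant(1.0f);
//
//     // Replicate twice along dimension 0 and three times along dimension 1.
//     Eigen::array<Eigen::Index, 2> bcast = {2, 3};
//     Eigen::Tensor<float, 2> output = input.broadcast(bcast);
//     // output.dimension(0) == 4, output.dimension(1) == 9
//     return 0;
//   }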
94 
95 // Eval as rvalue
96 template<typename Broadcast, typename ArgType, typename Device>
97 struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
98 {
99  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
100  typedef typename XprType::Index Index;
101  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
102  typedef DSizes<Index, NumDims> Dimensions;
103  typedef typename XprType::Scalar Scalar;
104  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
105  typedef typename XprType::CoeffReturnType CoeffReturnType;
106  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
107  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
108  protected: // all the non-static fields must have the same access control; otherwise the TensorEvaluator won't be standard layout.
109  bool isCopy, nByOne, oneByN;
110  public:
111  typedef StorageMemory<CoeffReturnType, Device> Storage;
112  typedef typename Storage::Type EvaluatorPointerType;
113 
114  enum {
115  IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
116  PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
117  BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
118  PreferBlockAccess = true,
119  Layout = TensorEvaluator<ArgType, Device>::Layout,
120  RawAccess = false
121  };
122 
123  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
124 
125  // We do block-based broadcasting using a trick with 2x the tensor rank and 0
126  // strides. See the block() method implementation for details.
127  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
128 
129  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
130  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
131  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
132 
133  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
134  ArgTensorBlock;
135 
136  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
137  Layout, Index>
138  TensorBlock;
139  //===--------------------------------------------------------------------===//
140 
141  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
142  : isCopy(false), nByOne(false), oneByN(false),
143  m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
144  {
145 
146  // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
147  // and store the result in a scalar. Instead one should first reshape the scalar into an
148  // N-D tensor (N >= 1) with a single element and then broadcast it.
149  EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
150  const InputDimensions& input_dims = m_impl.dimensions();
151  isCopy = true;
152  for (int i = 0; i < NumDims; ++i) {
153  eigen_assert(input_dims[i] > 0);
154  m_dimensions[i] = input_dims[i] * m_broadcast[i];
155  if (m_broadcast[i] != 1) {
156  isCopy = false;
157  }
158  }
159 
160  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
161  m_inputStrides[0] = 1;
162  m_outputStrides[0] = 1;
163  for (int i = 1; i < NumDims; ++i) {
164  m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
165  m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
166  }
167  } else {
168  m_inputStrides[NumDims-1] = 1;
169  m_outputStrides[NumDims-1] = 1;
170  for (int i = NumDims-2; i >= 0; --i) {
171  m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
172  m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
173  }
174  }
175 
176  if (input_dims[0] == 1) {
177  oneByN = true;
178  for (int i = 1; i < NumDims; ++i) {
179  if (m_broadcast[i] != 1) {
180  oneByN = false;
181  break;
182  }
183  }
184  } else if (input_dims[NumDims-1] == 1) {
185  nByOne = true;
186  for (int i = 0; i < NumDims-1; ++i) {
187  if (m_broadcast[i] != 1) {
188  nByOne = false;
189  break;
190  }
191  }
192  }
193 
194  // Handle special formats like NCHW, where the input shape is '[1, N..., 1]' and
195  // the broadcast shape is '[N, 1..., N]'.
196  if (!oneByN && !nByOne) {
197  if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
198  nByOne = true;
199  oneByN = true;
200  for (int i = 1; i < NumDims-1; ++i) {
201  if (m_broadcast[i] != 1) {
202  nByOne = false;
203  oneByN = false;
204  break;
205  }
206  }
207  }
208  }
209  }
210 
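// Worked example (illustrative values): for a ColMajor input of dimensions [2, 3]
// broadcast by [4, 1], the constructor above computes
//   m_dimensions    = [8, 3]   (input_dims[i] * m_broadcast[i])
//   m_inputStrides  = [1, 2]   (column-major strides of the input)
//   m_outputStrides = [1, 8]   (column-major strides of the broadcast output)
// isCopy is false because one broadcast factor differs from 1, and the oneByN /
// nByOne fast paths stay disabled because neither boundary input dimension is 1
// (e.g. an input of [1, 3] broadcast by [4, 1] would instead set oneByN = true).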
211  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
212 
213  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
214  m_impl.evalSubExprsIfNeeded(NULL);
215  return true;
216  }
217 
218 #ifdef EIGEN_USE_THREADS
219  template <typename EvalSubExprsCallback>
220  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
221  EvaluatorPointerType, EvalSubExprsCallback done) {
222  m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
223  }
224 #endif // EIGEN_USE_THREADS
225 
226  EIGEN_STRONG_INLINE void cleanup() {
227  m_impl.cleanup();
228  }
229 
230  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
231  {
232  if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
233  return m_impl.coeff(0);
234  }
235 
236  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
237  if (isCopy) {
238  return m_impl.coeff(index);
239  } else {
240  return coeffColMajor(index);
241  }
242  } else {
243  if (isCopy) {
244  return m_impl.coeff(index);
245  } else {
246  return coeffRowMajor(index);
247  }
248  }
249  }
250 
251  // TODO: attempt to speed this up. The integer divisions and modulo operations are slow.
252  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
253  Index inputIndex = 0;
254  EIGEN_UNROLL_LOOP
255  for (int i = NumDims - 1; i > 0; --i) {
256  const Index idx = index / m_outputStrides[i];
257  if (internal::index_statically_eq<Broadcast>(i, 1)) {
258  eigen_assert(idx < m_impl.dimensions()[i]);
259  inputIndex += idx * m_inputStrides[i];
260  } else {
261  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
262  eigen_assert(idx % m_impl.dimensions()[i] == 0);
263  } else {
264  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
265  }
266  }
267  index -= idx * m_outputStrides[i];
268  }
269  if (internal::index_statically_eq<Broadcast>(0, 1)) {
270  eigen_assert(index < m_impl.dimensions()[0]);
271  inputIndex += index;
272  } else {
273  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
274  eigen_assert(index % m_impl.dimensions()[0] == 0);
275  } else {
276  inputIndex += (index % m_impl.dimensions()[0]);
277  }
278  }
279  return inputIndex;
280  }
281 
282  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
283  {
284  return m_impl.coeff(indexColMajor(index));
285  }
286 
287  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
288  Index inputIndex = 0;
289  EIGEN_UNROLL_LOOP
290  for (int i = 0; i < NumDims - 1; ++i) {
291  const Index idx = index / m_outputStrides[i];
292  if (internal::index_statically_eq<Broadcast>(i, 1)) {
293  eigen_assert(idx < m_impl.dimensions()[i]);
294  inputIndex += idx * m_inputStrides[i];
295  } else {
296  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
297  eigen_assert(idx % m_impl.dimensions()[i] == 0);
298  } else {
299  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
300  }
301  }
302  index -= idx * m_outputStrides[i];
303  }
304  if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
305  eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
306  inputIndex += index;
307  } else {
308  if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
309  eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
310  } else {
311  inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
312  }
313  }
314  return inputIndex;
315  }
316 
317  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
318  {
319  return m_impl.coeff(indexRowMajor(index));
320  }
321 
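// Worked example (illustrative, reusing the [2, 3] input broadcast by [4, 1] from
// above): output linear index 19 corresponds to column-major output coordinates
// (3, 2). indexColMajor(19) walks the dimensions from outer to inner:
//   i = 1: idx = 19 / 8 = 2,  inputIndex += (2 % 3) * 2 = 4,  index = 19 - 2*8 = 3
//   i = 0: inputIndex += 3 % 2 = 1
// yielding input linear index 5, i.e. input coordinates (1, 2) = (3 mod 2, 2 mod 3).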
322  template<int LoadMode>
323  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
324  {
325  if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
326  return internal::pset1<PacketReturnType>(m_impl.coeff(0));
327  }
328 
329  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
330  if (isCopy) {
331  #ifdef EIGEN_GPU_COMPILE_PHASE
332  // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speedup by enforcing
333  // unaligned loads here. The reason is unclear, though.
334  return m_impl.template packet<Unaligned>(index);
335  #else
336  return m_impl.template packet<LoadMode>(index);
337  #endif
338  } else if (oneByN && !nByOne) {
339  return packetNByOne<LoadMode>(index);
340  } else if (!oneByN && nByOne) {
341  return packetOneByN<LoadMode>(index);
342  } else if (oneByN && nByOne) {
343  return packetOneByNByOne<LoadMode>(index);
344  } else {
345  return packetColMajor<LoadMode>(index);
346  }
347  } else {
348  if (isCopy) {
349  #ifdef EIGEN_GPU_COMPILE_PHASE
350  // See above.
351  return m_impl.template packet<Unaligned>(index);
352  #else
353  return m_impl.template packet<LoadMode>(index);
354  #endif
355  } else if (oneByN && !nByOne) {
356  return packetOneByN<LoadMode>(index);
357  } else if (!oneByN && nByOne) {
358  return packetNByOne<LoadMode>(index);
359  } else if (oneByN && nByOne) {
360  return packetOneByNByOne<LoadMode>(index);
361  } else {
362  return packetRowMajor<LoadMode>(index);
363  }
364  }
365  }
366 
367  template<int LoadMode>
368  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
369  (Index index) const
370  {
371  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
372  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
373 
374  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
375  Index startDim, endDim;
376  Index inputIndex, outputOffset, batchedIndex;
377 
378  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
379  startDim = NumDims - 1;
380  endDim = 1;
381  } else {
382  startDim = 0;
383  endDim = NumDims - 2;
384  }
385 
386  batchedIndex = index % m_outputStrides[startDim];
387  inputIndex = batchedIndex / m_outputStrides[endDim];
388  outputOffset = batchedIndex % m_outputStrides[endDim];
389 
390  if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
391  values[0] = m_impl.coeff(inputIndex);
392  return internal::pload1<PacketReturnType>(values);
393  } else {
394  EIGEN_UNROLL_LOOP
395  for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
396  if (outputOffset + cur < m_outputStrides[endDim]) {
397  values[i] = m_impl.coeff(inputIndex);
398  } else {
399  ++inputIndex;
400  inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
401  values[i] = m_impl.coeff(inputIndex);
402  outputOffset = 0;
403  cur = 0;
404  }
405  }
406  return internal::pload<PacketReturnType>(values);
407  }
408  }
409 
410  template<int LoadMode>
411  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
412  {
413  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
414  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
415 
416  Index dim, inputIndex;
417 
418  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
419  dim = NumDims - 1;
420  } else {
421  dim = 0;
422  }
423 
424  inputIndex = index % m_inputStrides[dim];
425  if (inputIndex + PacketSize <= m_inputStrides[dim]) {
426  return m_impl.template packet<Unaligned>(inputIndex);
427  } else {
428  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
429  EIGEN_UNROLL_LOOP
430  for (int i = 0; i < PacketSize; ++i) {
431  if (inputIndex > m_inputStrides[dim]-1) {
432  inputIndex = 0;
433  }
434  values[i] = m_impl.coeff(inputIndex++);
435  }
436  return internal::pload<PacketReturnType>(values);
437  }
438  }
439 
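// Illustrative case for the path above: a ColMajor input of dimensions [3, 1]
// broadcast by [1, 4] produces a [3, 4] output whose column-major layout is the
// input column repeated four times. Here m_inputStrides[1] == 3, so packetOneByN
// reduces every output index to index % 3: a packet is loaded directly when it
// fits inside one period, and gathered element by element (wrapping the input
// index back to 0) when it straddles a period boundary.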
440  template<int LoadMode>
441  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
442  {
443  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
444  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
445 
446  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
447  Index dim, inputIndex, outputOffset;
448 
449  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
450  dim = 1;
451  } else {
452  dim = NumDims - 2;
453  }
454 
455  inputIndex = index / m_outputStrides[dim];
456  outputOffset = index % m_outputStrides[dim];
457  if (outputOffset + PacketSize <= m_outputStrides[dim]) {
458  values[0] = m_impl.coeff(inputIndex);
459  return internal::pload1<PacketReturnType>(values);
460  } else {
461  EIGEN_UNROLL_LOOP
462  for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
463  if (outputOffset + cur < m_outputStrides[dim]) {
464  values[i] = m_impl.coeff(inputIndex);
465  } else {
466  values[i] = m_impl.coeff(++inputIndex);
467  outputOffset = 0;
468  cur = 0;
469  }
470  }
471  return internal::pload<PacketReturnType>(values);
472  }
473  }
474 
475  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
476  // the alignment at compile time.
477  template<int LoadMode>
478  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
479  {
480  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
481  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
482 
483  const Index originalIndex = index;
484 
485  Index inputIndex = 0;
486  EIGEN_UNROLL_LOOP
487  for (int i = NumDims - 1; i > 0; --i) {
488  const Index idx = index / m_outputStrides[i];
489  if (internal::index_statically_eq<Broadcast>(i, 1)) {
490  eigen_assert(idx < m_impl.dimensions()[i]);
491  inputIndex += idx * m_inputStrides[i];
492  } else {
493  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
494  eigen_assert(idx % m_impl.dimensions()[i] == 0);
495  } else {
496  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
497  }
498  }
499  index -= idx * m_outputStrides[i];
500  }
501  Index innermostLoc;
502  if (internal::index_statically_eq<Broadcast>(0, 1)) {
503  eigen_assert(index < m_impl.dimensions()[0]);
504  innermostLoc = index;
505  } else {
506  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
507  eigen_assert(index % m_impl.dimensions()[0] == 0);
508  innermostLoc = 0;
509  } else {
510  innermostLoc = index % m_impl.dimensions()[0];
511  }
512  }
513  inputIndex += innermostLoc;
514 
515  // TODO: this could be extended to the second dimension if we're not
516  // broadcasting along the first dimension, and so on.
517  if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
518  return m_impl.template packet<Unaligned>(inputIndex);
519  } else {
520  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
521  values[0] = m_impl.coeff(inputIndex);
522  EIGEN_UNROLL_LOOP
523  for (int i = 1; i < PacketSize; ++i) {
524  if (innermostLoc + i < m_impl.dimensions()[0]) {
525  values[i] = m_impl.coeff(inputIndex+i);
526  } else {
527  values[i] = coeffColMajor(originalIndex+i);
528  }
529  }
530  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
531  return rslt;
532  }
533  }
534 
535  template<int LoadMode>
536  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
537  {
538  EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
539  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
540 
541  const Index originalIndex = index;
542 
543  Index inputIndex = 0;
544  EIGEN_UNROLL_LOOP
545  for (int i = 0; i < NumDims - 1; ++i) {
546  const Index idx = index / m_outputStrides[i];
547  if (internal::index_statically_eq<Broadcast>(i, 1)) {
548  eigen_assert(idx < m_impl.dimensions()[i]);
549  inputIndex += idx * m_inputStrides[i];
550  } else {
551  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
552  eigen_assert(idx % m_impl.dimensions()[i] == 0);
553  } else {
554  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
555  }
556  }
557  index -= idx * m_outputStrides[i];
558  }
559  Index innermostLoc;
560  if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
561  eigen_assert(index < m_impl.dimensions()[NumDims-1]);
562  innermostLoc = index;
563  } else {
564  if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
565  eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
566  innermostLoc = 0;
567  } else {
568  innermostLoc = index % m_impl.dimensions()[NumDims-1];
569  }
570  }
571  inputIndex += innermostLoc;
572 
573  // TODO: this could be extended to the second dimension if we're not
574  // broadcasting along the first dimension, and so on.
575  if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
576  return m_impl.template packet<Unaligned>(inputIndex);
577  } else {
578  EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
579  values[0] = m_impl.coeff(inputIndex);
580  EIGEN_UNROLL_LOOP
581  for (int i = 1; i < PacketSize; ++i) {
582  if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
583  values[i] = m_impl.coeff(inputIndex+i);
584  } else {
585  values[i] = coeffRowMajor(originalIndex+i);
586  }
587  }
588  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
589  return rslt;
590  }
591  }
592 
593  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
594  costPerCoeff(bool vectorized) const {
595  double compute_cost = TensorOpCost::AddCost<Index>();
596  if (!isCopy && NumDims > 0) {
597  EIGEN_UNROLL_LOOP
598  for (int i = NumDims - 1; i > 0; --i) {
599  compute_cost += TensorOpCost::DivCost<Index>();
600  if (internal::index_statically_eq<Broadcast>(i, 1)) {
601  compute_cost +=
602  TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
603  } else {
604  if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
605  compute_cost += TensorOpCost::MulCost<Index>() +
606  TensorOpCost::ModCost<Index>() +
607  TensorOpCost::AddCost<Index>();
608  }
609  }
610  compute_cost +=
611  TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
612  }
613  }
614  return m_impl.costPerCoeff(vectorized) +
615  TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
616  }
617 
618  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
619  internal::TensorBlockResourceRequirements getResourceRequirements() const {
620  // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
621  // tensors. But this might need further tuning.
622  const size_t target_size = m_device.firstLevelCacheSize();
623  return internal::TensorBlockResourceRequirements::merge(
624  m_impl.getResourceRequirements(),
625  internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
626  }
627 
628  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
629  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
630  bool /*root_of_expr_ast*/ = false) const {
631  BlockBroadcastingParams params = blockBroadcastingParams(desc);
632 
633  if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
634  return emptyBlock();
635  }
636 
637  // Prepare storage for the materialized broadcasting result.
638  const typename TensorBlock::Storage block_storage =
639  TensorBlock::prepareStorage(desc, scratch);
640  ScalarNoConst* materialized_output = block_storage.data();
641 
642  // We may need to materialize input blocks.
643  size_t materialized_input_size = 0;
644  ScalarNoConst* materialized_input = NULL;
645 
646  // Initialize the block broadcasting iterator state for the outer dimensions
647  // (outer with regard to the bcast dimension). Dimensions in this array are
648  // always in inner-most -> outer-most order (col-major layout).
649  array<BlockBroadcastingIteratorState, NumDims> it;
650  int idx = 0;
651 
652  for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
653  const Index dim = IsColMajor ? i : NumDims - 1 - i;
654  it[idx].size = params.output_dims[dim];
655  it[idx].count = 0;
656  it[idx].output_stride = m_outputStrides[dim];
657  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
658  idx++;
659  }
660 
661  // Write output into the beginning of `materialized_output`.
662  Index output_offset = 0;
663 
664  // We will fill output block by broadcasting along the bcast dim, and
665  // iterating over outer dimension.
666  const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
667 
668  for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
669  ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
670  Index bcast_offset = desc.offset() + output_offset;
671 
672  // Broadcast along the bcast dimension.
673  num_output_coeffs += BroadcastBlockAlongBcastDim(
674  params, bcast_offset, scratch, bcast_output, &materialized_input,
675  &materialized_input_size);
676 
677  // Switch to the next outer dimension.
678  for (int j = 0; j < idx; ++j) {
679  if (++it[j].count < it[j].size) {
680  output_offset += it[j].output_stride;
681  break;
682  }
683  it[j].count = 0;
684  output_offset -= it[j].output_span;
685  }
686  }
687 
688  return block_storage.AsTensorMaterializedBlock();
689  }
690 
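// Standalone sketch (assumed names, not part of this header) of the odometer-style
// advance used in block() above for the outer dimensions: bump the innermost
// counter and, on overflow, reset it and carry into the next one while keeping a
// running output offset in sync via output_stride / output_span.
//
//   struct DimIterator { int size, count, stride, span; };  // span = stride * (size - 1)
//
//   // Returns false once every counter has wrapped, i.e. iteration is complete.
//   bool advance(DimIterator* it, int rank, int& offset) {
//     for (int j = 0; j < rank; ++j) {
//       if (++it[j].count < it[j].size) { offset += it[j].stride; return true; }
//       it[j].count = 0;
//       offset -= it[j].span;
//     }
//     return false;
//   }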
691  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
692 
693  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
694 
695  Broadcast functor() const { return m_broadcast; }
696 #ifdef EIGEN_USE_SYCL
697  // binding placeholder accessors to a command group handler for SYCL
698  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
699  cl::sycl::handler& cgh) const {
700  m_impl.bind(cgh);
701  }
702 #endif
703  private:
704  static const bool IsColMajor =
705  static_cast<int>(Layout) == static_cast<int>(ColMajor);
706 
707  // We will build the general-case block broadcasting on top of a broadcasting
708  // primitive that broadcasts only the inner dimension(s) and the first dimension
709  // whose size is smaller than the input size (this dimension is called `bcast_dim`).
710  //
711  // Example:
712  // dim: 0 1 2 (ColMajor)
713  // input size: [9, 3, 6]
714  // block size: [9, 2, 6]
715  //
716  // We will compute broadcasted block by iterating over the outer dimensions
717  // before `bcast_dim` (only dimension `2` in this example) and computing
718  // broadcasts along the `bcast_dim` (dimension `1` in this example).
719 
720  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
721  // single block along the broadcasting dimension. Sizes and strides along the
722  // `bcast_dim` might be invalid; they will be adjusted later in
723  // `BroadcastBlockAlongBcastDim`.
724  struct BlockBroadcastingParams {
725  Dimensions input_dims; // input expression dimensions
726  Dimensions output_dims; // output block sizes
727  Dimensions output_strides; // output block strides
728 
729  int inner_dim_count; // count inner dimensions matching in size
730  int bcast_dim; // broadcasting dimension index
731  Index bcast_dim_size; // broadcasting dimension size
732  Index inner_dim_size; // inner dimensions size
733 
734  // Block sizes and strides for the input block where all dimensions before
735  // `bcast_dim` are equal to `1`.
736  Dimensions input_block_sizes;
737  Dimensions input_block_strides;
738 
739  // Block sizes and strides for blocks with extra dimensions and strides `0`.
740  BroadcastDimensions bcast_block_sizes;
741  BroadcastDimensions bcast_block_strides;
742  BroadcastDimensions bcast_input_strides;
743  };
744 
745  struct BlockBroadcastingIteratorState {
746  Index size;
747  Index count;
748  Index output_stride;
749  Index output_span;
750  };
751 
752  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
753  blockBroadcastingParams(TensorBlockDesc& desc) const {
754  BlockBroadcastingParams params;
755 
756  params.input_dims = Dimensions(m_impl.dimensions());
757 
758  // Output block sizes and strides.
759  params.output_dims = desc.dimensions();
760  params.output_strides = internal::strides<Layout>(params.output_dims);
761 
762  // Find the broadcasting dimension (the first dimension with an output size
763  // smaller than the input size).
764  params.bcast_dim = 0;
765  params.bcast_dim_size = 1;
766  params.inner_dim_size = 1;
767 
768  // Count the number of inner dimensions that have the same size in the block
769  // and in the broadcast expression.
770  params.inner_dim_count = 0;
771 
772  for (int i = 0; i < NumDims; ++i) {
773  const int dim = IsColMajor ? i : NumDims - i - 1;
774 
775  if (params.output_dims[dim] == m_dimensions[dim]) {
776  params.inner_dim_size *= params.output_dims[dim];
777  ++params.inner_dim_count;
778  continue;
779  }
780 
781  // First non-matching dimension is the broadcasting dimension.
782  eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
783  params.bcast_dim = dim;
784  params.bcast_dim_size = params.output_dims[dim];
785  break;
786  }
787 
788  // Calculate the input block size for looking into the input.
789  for (int i = 0; i < params.inner_dim_count; ++i) {
790  const int dim = IsColMajor ? i : NumDims - i - 1;
791  params.input_block_sizes[dim] = params.input_dims[dim];
792  }
793  for (int i = params.inner_dim_count; i < NumDims; ++i) {
794  const int dim = IsColMajor ? i : NumDims - i - 1;
795  params.input_block_sizes[dim] = 1;
796  }
797  params.input_block_strides =
798  internal::strides<Layout>(params.input_block_sizes);
799 
800  // Broadcast with the 0-stride trick: create one extra dimension for each
801  // broadcast and set its input stride to 0.
802  //
803  // When ColMajor:
804  //
805  // - bcast_block_sizes:
806  // [d_0, b_0, d_1, b_1, ...]
807  //
808  // - bcast_block_strides:
809  // [output_block_strides[0], output_block_strides[0] * d_0,
810  // output_block_strides[1], output_block_strides[1] * d_1,
811  // ...]
812  //
813  // - bcast_input_strides:
814  // [input_block_strides[0], 0,
815  // input_block_strides[1], 0,
816  // ...].
817  //
818  for (int i = 0; i < params.inner_dim_count; ++i) {
819  const int dim = IsColMajor ? i : NumDims - i - 1;
820 
821  const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
822  const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
823 
824  params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
825  params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
826  params.bcast_block_strides[copy_dim] = params.output_strides[dim];
827  params.bcast_block_strides[broadcast_dim] =
828  params.output_strides[dim] * params.input_dims[dim];
829  params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
830  params.bcast_input_strides[broadcast_dim] = 0;
831  }
832 
833  for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
834  const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
835  params.bcast_block_sizes[dim] = 1;
836  params.bcast_block_strides[dim] = 0;
837  params.bcast_input_strides[dim] = 0;
838  }
839 
840  return params;
841  }
842 
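// Worked example of the 0-stride trick (illustrative values): ColMajor input
// dimensions [2, 3], broadcast [4, 1], block covering the full [8, 3] output.
// Both dimensions match the output, so inner_dim_count == 2 and the doubled-rank
// arrays built above become
//   bcast_block_sizes   = [2, 4, 3, 1]   (input extent, copies, input extent, copies)
//   bcast_block_strides = [1, 2, 8, 24]  (output strides of the interleaved dims)
//   bcast_input_strides = [1, 0, 2, 0]   (a 0 stride re-reads the same input element)
// Reading the input through these strides repeats each length-2 input column four
// times to fill the corresponding length-8 output column, which is exactly the
// broadcast along dimension 0.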
843  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
844  DSizes<Index, NumDims> dimensions;
845  for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
846  return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
847  }
848 
849  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
850  BlockBroadcastingParams params, Index bcast_offset,
851  TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
852  ScalarNoConst** materialized_input,
853  size_t* materialized_input_size) const {
854  if (params.bcast_dim_size == 1) {
855  // We just need one block read using the values already set up above.
856  return BroadcastBlock(
857  params.input_block_sizes, params.input_block_strides,
858  params.bcast_block_sizes, params.bcast_block_strides,
859  params.bcast_input_strides, bcast_offset, 0, scratch,
860  materialized_output, materialized_input, materialized_input_size);
861 
862  } else if (params.input_dims[params.bcast_dim] == 1) {
863  // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
864  const int broadcast_bcast_dim =
865  IsColMajor ? 2 * params.inner_dim_count + 1
866  : 2 * NumDims - 2 * params.inner_dim_count - 2;
867 
868  params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
869  params.bcast_input_strides[broadcast_bcast_dim] = 0;
870  params.bcast_block_strides[broadcast_bcast_dim] =
871  params.output_strides[params.bcast_dim];
872 
873  return BroadcastBlock(
874  params.input_block_sizes, params.input_block_strides,
875  params.bcast_block_sizes, params.bcast_block_strides,
876  params.bcast_input_strides, bcast_offset, 0, scratch,
877  materialized_output, materialized_input, materialized_input_size);
878 
879  } else {
880  // Keep track of the total number of the coefficients written to the
881  // output block.
882  Index num_output_coeffs = 0;
883 
884  // The general case. Let's denote the output block as
885  //
886  // x[..., a:a+bcast_dim_size, :, ..., :]
887  //
888  // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
889  // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
890  // sub-blocks:
891  //
892  // (1) a:b, where b is the smallest multiple of
893  // input_dims[bcast_dim] in [a, a+bcast_dim_size].
894  //
895  // (2) b:c, where c is the largest multiple of input_dims[bcast_dim]
896  // in [a, a+bcast_dim_size].
897  //
898  // (3) c:a+bcast_dim_size .
899  //
900  // Or, when b and c do not exist, we just need to process the whole block
901  // together.
902 
903  // Find a.
904  const Index bcast_dim_left_index =
905  bcast_offset / m_outputStrides[params.bcast_dim];
906 
907  // Find b and c.
908  const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
909 
910  // The first multiple of input_bcast_dim_size at or after a. This is b when it
911  // is <= bcast_dim_left_index + bcast_dim_size.
912  const Index first_multiple =
913  divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
914  input_bcast_dim_size;
915 
916  if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
917  // b exists, so does c. Find it.
918  const Index last_multiple =
919  (bcast_dim_left_index + params.bcast_dim_size) /
920  input_bcast_dim_size * input_bcast_dim_size;
921  const int copy_bcast_dim =
922  IsColMajor ? 2 * params.inner_dim_count
923  : 2 * NumDims - 2 * params.inner_dim_count - 1;
924  const int broadcast_bcast_dim =
925  IsColMajor ? 2 * params.inner_dim_count + 1
926  : 2 * NumDims - 2 * params.inner_dim_count - 2;
927 
928  if (first_multiple > bcast_dim_left_index) {
929  const Index head_size = first_multiple - bcast_dim_left_index;
930  params.input_block_sizes[params.bcast_dim] = head_size;
931  params.bcast_block_sizes[copy_bcast_dim] = head_size;
932  params.bcast_input_strides[copy_bcast_dim] =
933  params.input_block_strides[params.bcast_dim];
934  params.bcast_block_strides[copy_bcast_dim] =
935  params.output_strides[params.bcast_dim];
936  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
937  params.bcast_input_strides[broadcast_bcast_dim] = 0;
938  params.bcast_block_strides[broadcast_bcast_dim] =
939  params.output_strides[params.bcast_dim] *
940  params.input_dims[params.bcast_dim];
941 
942  num_output_coeffs += BroadcastBlock(
943  params.input_block_sizes, params.input_block_strides,
944  params.bcast_block_sizes, params.bcast_block_strides,
945  params.bcast_input_strides, bcast_offset, 0, scratch,
946  materialized_output, materialized_input, materialized_input_size);
947  }
948  if (first_multiple < last_multiple) {
949  params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
950  params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
951  params.bcast_input_strides[copy_bcast_dim] =
952  params.input_block_strides[params.bcast_dim];
953  params.bcast_block_strides[copy_bcast_dim] =
954  params.output_strides[params.bcast_dim];
955  params.bcast_block_sizes[broadcast_bcast_dim] =
956  (last_multiple - first_multiple) / input_bcast_dim_size;
957  params.bcast_input_strides[broadcast_bcast_dim] = 0;
958  params.bcast_block_strides[broadcast_bcast_dim] =
959  params.output_strides[params.bcast_dim] *
960  params.input_dims[params.bcast_dim];
961  const Index offset = (first_multiple - bcast_dim_left_index) *
962  m_outputStrides[params.bcast_dim];
963 
964  num_output_coeffs += BroadcastBlock(
965  params.input_block_sizes, params.input_block_strides,
966  params.bcast_block_sizes, params.bcast_block_strides,
967  params.bcast_input_strides, bcast_offset, offset, scratch,
968  materialized_output, materialized_input, materialized_input_size);
969  }
970  if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
971  const Index tail_size =
972  bcast_dim_left_index + params.bcast_dim_size - last_multiple;
973  params.input_block_sizes[params.bcast_dim] = tail_size;
974  params.bcast_block_sizes[copy_bcast_dim] = tail_size;
975  params.bcast_input_strides[copy_bcast_dim] =
976  params.input_block_strides[params.bcast_dim];
977  params.bcast_block_strides[copy_bcast_dim] =
978  params.output_strides[params.bcast_dim];
979  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
980  params.bcast_input_strides[broadcast_bcast_dim] = 0;
981  params.bcast_block_strides[broadcast_bcast_dim] =
982  params.output_strides[params.bcast_dim] *
983  params.input_dims[params.bcast_dim];
984  const Index offset = (last_multiple - bcast_dim_left_index) *
985  m_outputStrides[params.bcast_dim];
986 
987  num_output_coeffs += BroadcastBlock(
988  params.input_block_sizes, params.input_block_strides,
989  params.bcast_block_sizes, params.bcast_block_strides,
990  params.bcast_input_strides, bcast_offset, offset, scratch,
991  materialized_output, materialized_input, materialized_input_size);
992  }
993  } else {
994  // b and c do not exist.
995  const int copy_bcast_dim =
996  IsColMajor ? 2 * params.inner_dim_count
997  : 2 * NumDims - 2 * params.inner_dim_count - 1;
998  params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
999  params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
1000  params.bcast_input_strides[copy_bcast_dim] =
1001  params.input_block_strides[params.bcast_dim];
1002  params.bcast_block_strides[copy_bcast_dim] =
1003  params.output_strides[params.bcast_dim];
1004 
1005  num_output_coeffs += BroadcastBlock(
1006  params.input_block_sizes, params.input_block_strides,
1007  params.bcast_block_sizes, params.bcast_block_strides,
1008  params.bcast_input_strides, bcast_offset, 0, scratch,
1009  materialized_output, materialized_input, materialized_input_size);
1010  }
1011 
1012  return num_output_coeffs;
1013  }
1014  }
1015 
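// Worked example of the head / middle / tail split above (illustrative values):
// with input_dims[bcast_dim] == 3, bcast_dim_left_index a == 2 and
// bcast_dim_size == 5, the block covers output positions [2, 7) along bcast_dim:
//   first_multiple b = divup(2, 3) * 3 = 3
//   last_multiple  c = (7 / 3) * 3     = 6
// so the block is processed as the head [2, 3) (partial copy of the input period),
// the middle [3, 6) (one full period, broadcast count (6 - 3) / 3 = 1) and the
// tail [6, 7) (another partial copy).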
1016  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
1017  const Dimensions& input_block_sizes,
1018  const Dimensions& input_block_strides,
1019  const BroadcastDimensions& bcast_block_sizes,
1020  const BroadcastDimensions& bcast_block_strides,
1021  const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
1022  Index offset, TensorBlockScratch& scratch,
1023  ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
1024  size_t* materialized_input_size) const {
1025  // ---------------------------------------------------------------------- //
1026  // Tensor block descriptor for reading block from the input.
1027  const Index input_offset = bcast_offset + offset;
1028  TensorBlockDesc input_desc(
1029  IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
1030  input_block_sizes);
1031 
1032  ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
1033 
1034  // ---------------------------------------------------------------------- //
1035  // Materialize input block into a temporary memory buffer only if it's not
1036  // already available in the arg block.
1037  const ScalarNoConst* input_buffer = NULL;
1038 
1039  if (input_block.data() != NULL) {
1040  // Input block already has raw data, there is no need to materialize it.
1041  input_buffer = input_block.data();
1042 
1043  } else {
1044  // Otherwise we have to do block assignment into a temporary buffer.
1045 
1046  // Maybe reuse previously allocated buffer, or allocate a new one with a
1047  // scratch allocator.
1048  const size_t input_total_size = input_block_sizes.TotalSize();
1049  if (*materialized_input == NULL ||
1050  *materialized_input_size < input_total_size) {
1051  *materialized_input_size = input_total_size;
1052  void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
1053  *materialized_input = static_cast<ScalarNoConst*>(mem);
1054  }
1055 
1056  typedef internal::TensorBlockAssignment<
1057  ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
1058  TensorBlockAssignment;
1059 
1060  TensorBlockAssignment::Run(
1061  TensorBlockAssignment::target(input_block_sizes, input_block_strides,
1062  *materialized_input),
1063  input_block.expr());
1064 
1065  input_buffer = *materialized_input;
1066  }
1067 
1068  // ---------------------------------------------------------------------- //
1069  // Copy data from materialized input block to the materialized output, using
1070  // given broadcast strides (strides with zeroes).
1071  typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
1072  TensorBlockIO;
1073 
1074  typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
1075  typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
1076  materialized_output + offset);
1077 
1078  return TensorBlockIO::Copy(dst, src);
1079  }
1080 
1081 protected:
1082  const Device EIGEN_DEVICE_REF m_device;
1083  const typename internal::remove_reference<Broadcast>::type m_broadcast;
1084  Dimensions m_dimensions;
1085  array<Index, NumDims> m_outputStrides;
1086  array<Index, NumDims> m_inputStrides;
1087  TensorEvaluator<ArgType, Device> m_impl;
1088 };
1089 
1090 
1091 } // end namespace Eigen
1092 
1093 #endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H