#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

namespace Eigen {
namespace internal {

// Forward declaration of the block IO class defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;
// Helper function to compute strides for a densely stored buffer of the given
// dimensions.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}
template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
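// A brief usage sketch (not part of the original header, assuming the usual
// Tensor module headers are available): for a 2x3x4 tensor the dense
// column-major strides are {1, 2, 6} and the row-major strides are {12, 4, 1}.
//
//   Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   auto col_strides = Eigen::internal::strides<Eigen::ColMajor>(dims);
//   auto row_strides = Eigen::internal::strides<Eigen::RowMajor>(dims);
//   // col_strides == {1, 2, 6}, row_strides == {12, 4, 1}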
// Block shape preference for blocks extracted from the larger tensor:
//  - kUniformAllDims: make block sizes as close as possible in all dimensions
//    (blocks are roughly "square").
//  - kSkewedInnerDims: give the innermost dimensions as much of the target
//    block size as possible (blocks are long in the inner dimension).
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
// Tensor block resource requirements: the preferred block shape, the target
// block size (in coefficients of the scalar type), and the approximate cost
// of computing a single block coefficient.
struct TensorBlockResourceRequirements {
  TensorBlockShapeType shape_type;  // target block shape
  size_t size;                      // target block size
  TensorOpCost cost_per_coeff;      // cost of computing a single block element

  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
                                  TensorOpCost cost_)
      : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes,
      TensorOpCost cost) {
    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
    return {shape_type, size, cost};
  }
  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes) {
    // Default cost: one scalar read and one scalar write per coefficient, with
    // no extra compute. Non-trivial block evaluation implementations should
    // provide their own cost approximation.
    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
                                    {/*bytes_loaded=*/sizeof(Scalar),
                                     /*bytes_stored=*/sizeof(Scalar),
                                     /*compute_cycles=*/0});
  }
  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
                                    size_in_bytes);
  }
  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
                                    size_in_bytes);
  }
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(
      const TensorBlockResourceRequirements& lhs,
      const TensorBlockResourceRequirements& rhs) {
    return {merge(lhs.shape_type, rhs.shape_type),
            merge(lhs.size, rhs.size),
            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};
  }
  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
      TensorOpCost cost) {
    cost_per_coeff += cost;
    return *this;
  }
  // Requirements for expressions that do not have any block evaluation
  // preference (e.g. a default tensor expression with raw buffer access).
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
  }
 private:
  using Requirements = TensorBlockResourceRequirements;

  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
    return numext::maxi(lhs_size, rhs_size);
  }

  static EIGEN_STRONG_INLINE TensorBlockShapeType
  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
            rhs == TensorBlockShapeType::kSkewedInnerDims)
               ? TensorBlockShapeType::kSkewedInnerDims
               : TensorBlockShapeType::kUniformAllDims;
  }

  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
                                                TensorOpCost rhs_cost) {
    return lhs_cost + rhs_cost;
  }
};
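// Usage sketch (not part of the original header): an evaluator that prefers
// skewed inner-dimension blocks of roughly 48KB of floats can describe that
// preference and merge it with the requirements of another expression.
//
//   using Req = Eigen::internal::TensorBlockResourceRequirements;
//   Req lhs = Req::skewed<float>(/*size_in_bytes=*/48 * 1024);
//   Req rhs = Req::uniform<float>(/*size_in_bytes=*/16 * 1024);
//   Req merged = Req::merge(lhs, rhs);
//   // merged.shape_type == TensorBlockShapeType::kSkewedInnerDims
//   // merged.size == 48 * 1024 / sizeof(float)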
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  // If the left-hand side of a tensor assignment already has a memory buffer,
  // the block may be evaluated directly into that final output memory. The
  // pointer type of the underlying storage is erased, because passing the
  // Scalar type through all the expression evaluation layers would require too
  // many templates; in practice the destination buffer type always matches the
  // evaluated expression scalar type.
  class DestinationBuffer {
   public:
    enum DestinationBufferKind : int {
      // The destination buffer is not defined (m_data == NULL).
      kEmpty,

      // Destination buffer is a memory buffer with a contiguous memory layout
      // (strides match the dense strides of the block dimensions).
      kContiguous,

      // Destination buffer is a memory buffer with strides that do not match
      // the strides of a contiguous buffer with block dimensions.
      kStrided
    };

    template <typename Scalar>
    Scalar* data() const {
      eigen_assert(m_data_type_size == sizeof(Scalar));
      return static_cast<Scalar*>(m_data);
    }

    const Dimensions& strides() const { return m_strides; }
    const DestinationBufferKind& kind() const { return m_kind; }
   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& strides,
                      DestinationBufferKind kind)
        : m_data(static_cast<void*>(data)),
          m_data_type_size(sizeof(Scalar)),
          m_strides(strides),
          m_kind(kind) {}
    template <int Layout, typename Scalar>
    static DestinationBuffer make(const TensorBlockDescriptor& desc,
                                  Scalar* data, const Dimensions& strides) {
      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
    }
    // A buffer is contiguous if its strides match the dense strides computed
    // from the descriptor dimensions (ignoring dimensions of size one).
    template <int Layout>
    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
                                      const Dimensions& strides) {
      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      for (int i = 0; i < NumDims; ++i) {
        if (desc_dims[i] == 1) continue;
        if (desc_strides[i] != strides[i]) return kStrided;
      }
      return kContiguous;
    }
    // The storage pointer is type erased to reduce template bloat, but we
    // still keep the size of the underlying element type for error checking.
    void* m_data;
    size_t m_data_type_size;

    // Destination buffer dimensions always match the dimensions of the tensor
    // block descriptor it belongs to, however strides might be different.
    Dimensions m_strides;

    DestinationBufferKind m_kind;
  };
  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}
  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  const DestinationBuffer& destination() const { return m_destination; }
  template <int Layout, typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
    eigen_assert(dst_base != NULL);
    m_destination =
        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
  }

  template <int Layout, typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base,
      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
    // The DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
  }
  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    m_destination.m_kind = DestinationBuffer::kEmpty;
    return *this;
  }

  bool HasDestinationBuffer() const {
    return m_destination.kind() != DestinationBuffer::kEmpty;
  }

  // Returns a copy of `*this` with an updated offset.
  TensorBlockDescriptor WithOffset(IndexType offset) const {
    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
  }

 private:
  // Offset and dimensions are immutable after construction. A block descriptor
  // can only be mutated by adding or dropping the destination buffer.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
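// Usage sketch (not part of the original header): a descriptor for a 32x16
// block starting at linear offset 128, optionally annotated with a
// destination buffer so the block can be materialized directly into the
// output memory (`output` below is a hypothetical buffer pointer).
//
//   typedef Eigen::internal::TensorBlockDescriptor<2> Desc;
//   Desc::Dimensions block_dims(32, 16);
//   Desc desc(/*offset=*/128, block_dims);
//   // float* output = ...;
//   // desc.AddDestinationBuffer<Eigen::ColMajor>(
//   //     output, Eigen::internal::strides<Eigen::ColMajor>(block_dims));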
// -------------------------------------------------------------------------- //
// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  TensorBlockMapper() = default;
  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
                    const TensorBlockResourceRequirements& requirements)
      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
    // Compute block dimensions and the total number of blocks.
    InitializeBlockDimensions();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
    return m_total_block_count;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
    return m_block_dimensions.TotalSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
  blockDimensions() const {
    return m_block_dimensions;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
  blockDescriptor(IndexType block_index) const {
    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    IndexType offset = 0;
    DSizes<IndexType, NumDims> dimensions;

    if (NumDims == 0) return BlockDescriptor(offset, dimensions);

    // Iterate outer -> inner dimensions.
    for (int i = NumDims - 1; i >= 0; --i) {
      const int dim = isColMajor ? i : NumDims - i - 1;

      const IndexType idx = block_index / m_block_strides[dim];
      block_index -= idx * m_block_strides[dim];

      const IndexType coord = idx * m_block_dimensions[dim];
      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
                                     m_block_dimensions[dim]);
      offset += coord * m_tensor_strides[dim];
    }

    return {offset, dimensions};
  }
 private:
  void InitializeBlockDimensions() {
    // Requested block shape and size.
    const TensorBlockShapeType shape_type = m_requirements.shape_type;
    IndexType target_block_size =
        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));

    IndexType tensor_size = m_tensor_dimensions.TotalSize();

    // Corner case: one of the dimensions is zero. The logic below is too
    // complex to handle this case in general, so just use unit block size
    // (we must not yield blocks with zero dimensions: that is a recipe for
    // overflows, divisions by zero and NaNs later).
    if (tensor_size == 0) {
      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] = 1;
      }
      m_total_block_count = 0;
      return;
    }

    // If the tensor fits into the target block size, evaluate it as a single
    // block.
    if (tensor_size <= target_block_size) {
      m_block_dimensions = m_tensor_dimensions;
      m_total_block_count = 1;
      // The only valid block index is `0`, and in this case we do not need to
      // compute real strides for the tensor or the blocks (see blockDescriptor).
      for (int i = 0; i < NumDims; ++i) {
        m_tensor_strides[i] = 0;
        m_block_strides[i] = 1;
      }
      return;
    }

    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
    // Block shape skewed towards the inner dimension.
    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
      IndexType coeff_to_allocate = target_block_size;

      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;
        m_block_dimensions[dim] =
            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
        coeff_to_allocate = divup(
            coeff_to_allocate,
            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
      }
      eigen_assert(coeff_to_allocate == 1);

    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
      // The tensor does not fit within the 'target_block_size' budget:
      // calculate block dimension sizes based on a "square" dimension size
      // target.
      const IndexType dim_size_target = convert_index<IndexType>(
          std::pow(static_cast<float>(target_block_size),
                   1.0f / static_cast<float>(m_block_dimensions.rank())));

      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] =
            numext::mini(dim_size_target, m_tensor_dimensions[i]);
      }

      // Add any unallocated coefficients to the inner dimension(s).
      IndexType total_size = m_block_dimensions.TotalSize();
      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;

        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
          const IndexType total_size_other_dims =
              total_size / m_block_dimensions[dim];
          const IndexType alloc_avail =
              divup<IndexType>(target_block_size, total_size_other_dims);
          if (alloc_avail == m_block_dimensions[dim]) {
            // Insufficient excess coefficients to allocate.
            break;
          }
          m_block_dimensions[dim] =
              numext::mini(m_tensor_dimensions[dim], alloc_avail);
          total_size = total_size_other_dims * m_block_dimensions[dim];
        }
      }

    } else {
      eigen_assert(false);  // unknown block shape
    }

    eigen_assert(m_block_dimensions.TotalSize() >=
                 numext::mini<IndexType>(target_block_size,
                                         m_tensor_dimensions.TotalSize()));
    // Calculate block counts by dimension and the total block count.
    DSizes<IndexType, NumDims> block_count;
    for (int i = 0; i < NumDims; ++i) {
      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
    }
    m_total_block_count = array_prod(block_count);

    // Calculate block strides (used for enumerating blocks).
    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
    m_block_strides = strides<Layout>(block_count);
  }
  DSizes<IndexType, NumDims> m_tensor_dimensions;
  TensorBlockResourceRequirements m_requirements;

  DSizes<IndexType, NumDims> m_block_dimensions;
  IndexType m_total_block_count;

  DSizes<IndexType, NumDims> m_tensor_strides;
  DSizes<IndexType, NumDims> m_block_strides;
};
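// Usage sketch (not part of the original header): split a 100x100 tensor into
// blocks of roughly 10x10 and enumerate the block descriptors.
//
//   typedef Eigen::internal::TensorBlockMapper<2, Eigen::ColMajor> Mapper;
//   Eigen::DSizes<Eigen::Index, 2> dims(100, 100);
//   Mapper mapper(dims, Eigen::internal::TensorBlockResourceRequirements::
//                           uniform<float>(100 * sizeof(float)));
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     auto desc = mapper.blockDescriptor(i);
//     // desc.offset() / desc.dimensions() describe one 10x10 block.
//   }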
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Because Eigen
// expression traversal order is deterministic, all temporary allocations
// happen in the same order and usually have exactly the same size, so after
// the first block evaluation the buffers can be reused for the next block.
template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }
  void* allocate(size_t size) {
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // The allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If the current allocation can't fit the requested size, deallocate it
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }
  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  std::vector<Allocation> m_allocations;
};
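// Usage sketch (not part of the original header): the allocator hands out the
// same buffers in the same order on every block evaluation, so per-block
// temporaries are only allocated once.
//
//   Eigen::DefaultDevice device;
//   Eigen::internal::TensorBlockScratchAllocator<Eigen::DefaultDevice>
//       scratch(device);
//   void* tmp0 = scratch.allocate(1024);
//   void* tmp1 = scratch.allocate(4096);
//   scratch.reset();                       // start of the next block
//   void* again = scratch.allocate(1024);  // reuses the first buffer
//   // again == tmp0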
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// a tensor evaluator's block function.
enum TensorBlockKind {
  // Lazy expression block that must be assigned to a destination buffer.
  kExpr,

  // View into a memory buffer owned by an underlying tensor expression.
  kView,

  // Block materialized in a scratch memory buffer allocated with
  // TensorBlockScratchAllocator; it must still be copied to its destination.
  kMaterializedInScratch,

  // Block materialized directly into the final output memory buffer.
  kMaterializedInOutput
};
// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from an Eigen expression (if the
// expression type is not void).
template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};
// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and its XprType is just a TensorMap over the data. This block type is
// typically used to materialize blocks of tensor expressions that can't be
// efficiently represented as lazy Tensor expressions with fast coeff/packet
// operations. The block does not own its memory buffer: it is either a view
// into the buffer that backs the original expression, or a buffer allocated
// with the scratch allocator.
template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions, bool valid_expr = true)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions),
        m_valid_expr(valid_expr) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }
  TensorBlockKind kind() const { return m_kind; }
  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  // TensorMaterializedBlock can be backed by different types of storage:
  //
  //   (1) Contiguous block of memory allocated with the scratch allocator.
  //   (2) Contiguous block of memory reused from the tensor block descriptor
  //       destination buffer.
  //   (3) Strided block of memory reused from the tensor block descriptor
  //       destination buffer.
  //
  class Storage {
   public:
    Scalar* data() const { return m_data; }
    const Dimensions& dimensions() const { return m_dimensions; }
    const Dimensions& strides() const { return m_strides; }

    TensorMaterializedBlock AsTensorMaterializedBlock() const {
      return TensorMaterializedBlock(
          m_materialized_in_output
              ? internal::TensorBlockKind::kMaterializedInOutput
              : internal::TensorBlockKind::kMaterializedInScratch,
          m_data, m_dimensions, !m_strided_storage);
    }
   private:
    friend class TensorMaterializedBlock;

    Storage(Scalar* data, const Dimensions& dimensions,
            const Dimensions& strides, bool materialized_in_output,
            bool strided_storage)
        : m_data(data),
          m_dimensions(dimensions),
          m_strides(strides),
          m_materialized_in_output(materialized_in_output),
          m_strided_storage(strided_storage) {}

    Scalar* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;
    bool m_materialized_in_output;
    bool m_strided_storage;
  };
  // Creates storage for the materialized block, either from the block
  // descriptor destination buffer, or by allocating a new buffer with the
  // scratch allocator.
  template <typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static Storage prepareStorage(
      TensorBlockDesc& desc, TensorBlockScratch& scratch,
      bool allow_strided_storage = false) {
    // Try to reuse the destination as an output block buffer.
    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;

    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/false);

    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
               allow_strided_storage) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/true);

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/false,
                     /*strided_storage=*/false);
    }
  }
  // Creates a materialized block for the given descriptor from a memory
  // buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the block dimensions cover a contiguous range of the underlying
    // memory, we can skip the block buffer allocation and construct a view
    // into the existing `data` buffer.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions have to be of size `1`, except a single
    // innermost dimension.
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      // Reuse the destination buffer or allocate a new buffer with the
      // scratch allocator.
      const Storage storage = prepareStorage(desc, scratch);

      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
                           storage.data());

      TensorBlockIO::Copy(dst, src);
      return storage.AsTensorMaterializedBlock();
    }
  }
 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
  bool m_valid_expr;
};
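// Usage sketch (not part of the original header): materialize a block
// described by `desc` from a raw buffer; `data`, `data_dims`, `desc` and
// `scratch` below are hypothetical, previously constructed objects. A scratch
// or destination buffer is only used when the block is not a contiguous view
// into `data`.
//
//   // typedef Eigen::internal::TensorMaterializedBlock<float, 2,
//   //                                                  Eigen::ColMajor> Block;
//   // Block block = Block::materialize(data, data_dims, desc, scratch);
//   // if (block.kind() == Eigen::internal::TensorBlockKind::kView) {
//   //   // block.data() points directly into data + desc.offset().
//   // }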
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies the
// UnaryOp functor to the block produced by the underlying tensor expression.
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
      type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies the
// BinaryOp functor to the blocks produced by the underlying tensor
// expressions.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is
// a generalization of TensorCwiseUnaryBlock for arbitrary expressions).
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
  typedef typename ArgTensorBlock::XprType ArgXprType;
  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.
template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                       internal::is_void<Arg2XprType>::value ||
                                       internal::is_void<Arg3XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for copy, scatter,
// gather and fill.
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  // Specifying the linear copy kind statically gives a measurable speedup for
  // small copy sizes.
  enum class Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };
  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }
 private:
  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;
    if (kind == StridedLinearBufferCopy::Kind::Linear) {
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
      eigen_assert(dst_stride == 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
      // Random-access copy: both strides are arbitrary.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }

    } else {
      eigen_assert(false);
    }
  }
};
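// Usage sketch (not part of the original header): copy 256 contiguous floats
// from `src_buf` to `dst_buf` (both hypothetical, suitably sized buffers)
// using the statically selected Linear kind.
//
//   typedef Eigen::internal::StridedLinearBufferCopy<float, Eigen::Index> Copy;
//   // Copy::Run<Copy::Kind::Linear>(
//   //     Copy::Dst(/*offset=*/0, /*stride=*/1, dst_buf),
//   //     Copy::Src(/*offset=*/0, /*stride=*/1, src_buf),
//   //     /*count=*/256);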
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// with `dst_to_src_dim_map`.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;
  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };
  // Copies data to `dst` from `src`, using the provided dimension mapping
  // between source and destination dimensions. Returns the number of copied
  // elements.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }
    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case of stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dimension.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }
    // Outermost dimension in the dst with `stride == 1` (modulo effective
    // inner dimension).
    const int dst_stride1_dim = IsColMajor
                                    ? num_size_one_inner_dims
                                    : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }
    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize the block iterator state. Squeeze away any dimension of
    // size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }
    // Iterate copying data from src to dst.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

    // Copy one inner dimension with the statically selected copy kind, then
    // advance the squeezed iterator state to the next output row.
#define COPY_INNER_DIM(KIND)                                           \
  IndexType num_copied = 0;                                            \
  for (num_copied = 0; num_copied < block_total_size;                  \
       num_copied += dst_inner_dim_size) {                             \
    LinCopy::template Run<KIND>(                                       \
        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
        typename LinCopy::Src(input_offset, input_stride, src.data),   \
        dst_inner_dim_size);                                           \
                                                                       \
    for (int j = 0; j < idx; ++j) {                                    \
      if (++it[j].count < it[j].size) {                                \
        input_offset += it[j].input_stride;                            \
        output_offset += it[j].output_stride;                          \
        break;                                                         \
      }                                                                \
      it[j].count = 0;                                                 \
      input_offset -= it[j].input_span;                                \
      output_offset -= it[j].output_span;                              \
    }                                                                  \
  }                                                                    \
  return num_copied;

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Kind::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
  // the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
                                                              const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    return Copy(dst, src, dst_to_src_map);
  }
 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };
  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only if
  // they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
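// Usage sketch (not part of the original header): copy a 2x3 column-major
// block out of a larger buffer into a dense 2x3 destination with the identity
// dimension map; `dst_buf`, `src_buf`, `src_strides` and `block_offset` below
// are hypothetical.
//
//   typedef Eigen::internal::TensorBlockIO<float, Eigen::Index, 2,
//                                          Eigen::ColMajor> IO;
//   IO::Dimensions block_dims(2, 3);
//   // IO::Dst dst(block_dims,
//   //             Eigen::internal::strides<Eigen::ColMajor>(block_dims),
//   //             dst_buf);
//   // IO::Src src(src_strides, src_buf, /*src_offset=*/block_offset);
//   // IO::Copy(dst, src);  // returns the number of copied coefficients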
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr`
// to a tensor block backed by a memory buffer at `target`. The innermost
// dimension of `target` must have stride '1' (contiguous in memory).
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };
  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };
  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };
 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };
  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // The DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    // Prepare the evaluator for the block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match destination dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize the output inner dimension size based on the layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);
    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }
    // Initialize the output block iterator state. Dimensions in this array
    // are always in inner-most -> outer-most order (column-major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }
    // We read the block expression from the beginning, and start writing data
    // to `target` at the given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at the current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move the input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update the output offset.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }
 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};
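// Usage sketch (not part of the original header): evaluate a block expression
// `expr` into a dense destination buffer; `BlockExpr`, `block_dims`, `dst_buf`
// and `expr` below are hypothetical.
//
//   // typedef Eigen::internal::TensorBlockAssignment<float, 2, BlockExpr>
//   //     Assign;
//   // Assign::Run(
//   //     Assign::target(block_dims,
//   //                    Eigen::internal::strides<Eigen::ColMajor>(block_dims),
//   //                    dst_buf),
//   //     expr);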
}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H