#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {

// Returns true if an expression tree contains at least one
// TensorBroadcastingOp. Broadcasting is expensive on the coefficient/packet
// path, so its presence is used to prefer block (tiled) evaluation.
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};
template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};
template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};
template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};
template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
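// Illustrative sketch (hedged, user-level API; shapes and names are examples
// only): an assignment whose right-hand side contains a broadcast makes this
// trait report true for the assembled TensorAssignOp expression, which steers
// evaluation toward the tiled executors below.
//
//   Eigen::Tensor<float, 2> a(8, 8);
//   Eigen::Tensor<float, 2> b(8, 1);
//   Eigen::array<Eigen::Index, 2> bcast = {1, 8};
//   a = a + b.broadcast(bcast);  // expression tree contains a TensorBroadcastingOp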
/**
 * Default strategy: the expression is evaluated sequentially on a single CPU
 * thread, without vectorization and without tiling.
 */
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Instantiating the default executor with a non-default device usually means
  // the device-specific support was not enabled before including Eigen.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
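// Illustrative sketch (hedged): this default, single-threaded executor is what
// runs behind a plain tensor assignment when no device is specified.
//
//   Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
//   a.setRandom();
//   b = a * 2.0f;  // evaluated coefficient-by-coefficient by run() above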
/**
 * Default async execution strategy is not implemented: asynchronous execution
 * is currently only available for ThreadPoolDevice.
 */
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
/**
 * Process all the data on a single CPU thread, using vectorized instructions.
 */
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize =
          unpacket_traits<typename TensorEvaluator<
              Expression, DefaultDevice>::PacketReturnType>::size;

      // Manually unroll the packet loop by a factor of four to give the
      // compiler a strong opportunity to keep it unrolled.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
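// Worked example of the loop bounds above (illustrative numbers): with
// size = 1000 and PacketSize = 8, UnrolledSize = (1000 / 32) * 32 = 992 and
// VectorizedSize = (1000 / 8) * 8 = 1000, so indices [0, 992) are processed in
// unrolled groups of four packets, [992, 1000) by a single packet, and no
// scalar tail remains.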
/**
 * Process all the data on a single CPU thread, using blocks of data. Sizing a
 * block to fit in the L1 cache improves cache behaviour for large tensors.
 */
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static const int NumDims = traits<Expression>::NumDimensions;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Query the expression tree for its preferred block size and shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share the scratch memory allocator between all evaluated blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
/**
 * Multicore strategy: the index space is partitioned and each partition is
 * executed on a single core.
 */
#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigates through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
// Computes the block evaluation parameters (block mapper, per-block cost and
// aligned scratch size) used by the tiled ThreadPool executors below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for its preferred block size and shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
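// Worked example of the alignment math above (illustrative numbers): for a
// block of 1000 floats with EIGEN_MAX_ALIGN_BYTES = 64,
// block_size * sizeof(Scalar) = 4000 bytes and
// aligned_blocksize = 64 * divup(4000, 64) = 64 * 63 = 4032 bytes, i.e. the
// per-block scratch buffer is rounded up to a multiple of the alignment.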
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Process unrolled groups of four packets first, then single packets.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    // Scalar tail for the remaining coefficients.
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block sizes to the (unrolled) packet size, unless doing so would
    // grow a small block too much.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
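// Worked example of alignBlockSize (illustrative numbers, PacketSize = 8): a
// requested block of 200 coefficients is >= 16 * 8 = 128, so it is rounded up
// to a multiple of 4 * 8 = 32, giving 224; a block of 100 coefficients is
// rounded up only to a multiple of 8, giving 104, which keeps small blocks
// from growing disproportionately.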
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(
          size, evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&evaluator, firstIdx, lastIdx);
          });
    }
    evaluator.cleanup();
  }
};
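// Illustrative sketch (hedged): with EIGEN_USE_THREADS defined, assigning
// through a ThreadPoolDevice dispatches to this executor, which shards the
// index space via device.parallelFor(). The pool size below is an example
// only.
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Tensor<float, 2> a(512, 512), b(512, 512);
//   a.setRandom();
//   b.device(device) = a.square();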
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    // The last task to delete the context cleans up the evaluator and then
    // invokes the user-provided done callback.
    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
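// Illustrative sketch (hedged; the exact device() overload depends on the
// Eigen version): asynchronous evaluation is requested by passing a done
// callback along with the device, which routes the assignment through
// runAsync() above instead of blocking the caller.
//
//   Eigen::Barrier barrier(1);
//   b.device(device, [&barrier]() { barrier.Notify(); }) = a.square();
//   barrier.Wait();  // block until the async evaluation has completed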
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};
#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to the GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)

template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
           StorageIndex step_size) {
    // Grid-stride loop over scalar coefficients.
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
           StorageIndex step_size) {
    const StorageIndex PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path for the bulk of the range.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    // Scalar tail for the remaining coefficients.
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx;
         i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(
      eval, first_index, size, step_size);
}
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void
TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Launch at least one block so that empty tensors do not crash the launch.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
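// Illustrative sketch (hedged; the stream/device types below are assumptions
// that depend on the Eigen version): when compiled with a GPU compiler and
// EIGEN_USE_GPU, an assignment through a GpuDevice lands in run() above,
// which launches EigenMetaKernel over the flattened index space.
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice device(&stream);
//   b.device(device) = a * a;  // a, b are GPU-resident Tensors or TensorMaps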
#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;

  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_SYCL

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H