|
#pragma once |
|
|
|
#include "cutlass/cutlass.h" |
|
#include <climits> |
|
|
|
|
|
|
|
|
|
#define CUTLASS_CHECK(status) \ |
|
{ \ |
|
TORCH_CHECK(status == cutlass::Status::kSuccess, \ |
|
cutlassGetStatusString(status)) \ |
|
} |
|
|
|
inline uint32_t next_pow_2(uint32_t const num) { |
|
if (num <= 1) return num; |
|
return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); |
|
} |
|
|
|
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { |
|
int max_shared_mem_per_block_opt_in = 0; |
|
cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, |
|
cudaDevAttrMaxSharedMemoryPerBlockOptin, |
|
device); |
|
return max_shared_mem_per_block_opt_in; |
|
} |
|
|
|
|