Change the number of CUDA DOT thread blocks from 256 to 1024

This improves the performance on Ampere (A100) GPUs.

Fixes #137.
This commit is contained in:
Tom Deakin 2023-06-12 15:49:59 +01:00
parent 893af9f5d0
commit 092ee67764

View File

@@ -22,7 +22,7 @@
#endif #endif
#define TBSIZE 1024 #define TBSIZE 1024
#define DOT_NUM_BLOCKS 256 #define DOT_NUM_BLOCKS 1024
template <class T> template <class T>
class CUDAStream : public Stream<T> class CUDAStream : public Stream<T>