Change the number of CUDA DOT thread blocks from 256 to 1024
This improves the performance on Ampere (A100) GPUs. Fixes #137.
This commit is contained in:
parent
893af9f5d0
commit
092ee67764
@@ -22,7 +22,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define TBSIZE 1024
|
#define TBSIZE 1024
|
||||||
#define DOT_NUM_BLOCKS 256
|
#define DOT_NUM_BLOCKS 1024
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
class CUDAStream : public Stream<T>
|
class CUDAStream : public Stream<T>
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user