Change the number of CUDA DOT thread blocks from 256 to 1024
This improves the performance on Ampere (A100) GPUs. Fixes #137.
This commit is contained in:
parent
893af9f5d0
commit
092ee67764
@@ -22,7 +22,7 @@
|
||||
#endif
|
||||
|
||||
#define TBSIZE 1024
|
||||
-#define DOT_NUM_BLOCKS 256
|
||||
+#define DOT_NUM_BLOCKS 1024
|
||||
|
||||
template <class T>
|
||||
class CUDAStream : public Stream<T>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user