#include #include #define A(i,j) A[(i) + (j)*lda] #define B(i,j) B[(i) + (j)*ldb] #define C(i,j) C[(i) + (j)*ldc] #define sa6(i,j) sa6[((j)<<5) + (i)] #define sb6(i,j) sb6[((i)<<5) + (j)] #define MS 32 #define NS 32 #define KS 32 // cache blocking version, without register-level data reuse // with memory coalescing on shared memory // more workloads per thread. 4x1 micro kernel. // adopts vectorized load/store __global__ __launch_bounds__(256) void mysgemm_v6(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){ int lda = M, ldb = K, ldc = M; int tx = threadIdx.x; int bx = blockIdx.x, by = blockIdx.y; int row1 = (tx&7)<<2, row2 = row1+1, row3 = row1+2, row4 = row1+3, col = tx>>3; A = &A((bx<<5),0); B = &B(0,(by<<5)); C = &C((bx<<5),(by<<5)); __shared__ float sa6[MS*KS]; __shared__ float sb6[KS*NS]; float4 Av, Bv, Cv, Cres; Cres.x = 0., Cres.y = 0., Cres.z = 0., Cres.w = 0.; float b00; for (int k_count = 0; k_count