// NOTE(review): the two header names below were stripped by whatever extracted this
// chunk (angle-bracket contents lost). Restore them (likely <stdio.h> / <stdlib.h>
// or similar) before compiling — TODO confirm against the original file.
#include #include
// Column-major element accessors for the global matrices, using leading
// dimensions lda/ldb/ldc (set to M/K/M inside the kernel below).
#define A(i,j) A[(i) + (j)*lda]
#define B(i,j) B[(i) + (j)*ldb]
#define C(i,j) C[(i) + (j)*ldc]
// Shared-memory tile accessors: element (i,j) lives at (j)*128 + (i),
// i.e. the tiles are stored column-major with a column stride of 128 (<<7).
#define sa8(i,j) sa8[((j)<<7) + (i)]
#define sb8(i,j) sb8[((j)<<7) + (i)]
// Block-tile dimensions for this version: each thread block works on a
// 128 (M) x 128 (N) tile of C, stepping through K in slices of 8.
#define MS_8 128
#define NS_8 128
#define KS_8 8
// v1 += v2 * s3 — elementwise multiply-accumulate on a float4 (vector scaling).
#define vscal(v1, v2, s3)\
v1.x+=v2.x*s3;\
v1.y+=v2.y*s3;\
v1.z+=v2.z*s3;\
v1.w+=v2.w*s3;
// v1 = alpha * v2 + beta * v3 — elementwise SIMD fma on float4 components.
#define simd_axpby(v1, alpha, v2, beta, v3)\
v1.x=alpha*v2.x+beta*v3.x;\
v1.y=alpha*v2.y+beta*v3.y;\
v1.z=alpha*v2.z+beta*v3.z;\
v1.w=alpha*v2.w+beta*v3.w;
// 128-bit vectorized load/store via float4 pointer cast.
// NOTE(review): these require addr to be 16-byte aligned — holds here only if the
// matrix base pointers and leading dimensions keep rows 16-byte aligned; verify at call sites.
#define vload(v1,addr)\
v1 = *((float4 *)(addr));
#define vstore(addr,v1)\
*((float4 *)(addr)) = v1;
// cache blocking version, without register-level data re-use
// with memory coelascing on shared memory
// more workloads per thread. 8x8 micro kernel.
// adopt vetorized load/store
//
// SGEMM: C = alpha*A*B + beta*C, column-major.
// Launch layout implied by the indexing below: 256 threads per block
// (enforced by __launch_bounds__), one block per 128x128 tile of C
// (block offsets are bx<<7, by<<7); each thread accumulates an 8x8
// micro-tile of C (Cres holds 16 float4s = 64 floats).
__global__ __launch_bounds__(256)
void mysgemm_v8(int M, int N, int K, float alpha, float* A, float* B, float beta, float* C){
    // Leading dimensions for the column-major accessor macros above.
    int lda = M, ldb = K, ldc = M;
    int tx = threadIdx.x;
    int bx = blockIdx.x, by = blockIdx.y;
    // Per-thread coordinates for staging global->shared copies:
    // A tile: thread tx handles the float4 at row (tx%32)*4, column tx/32.
    int row_a = (tx&31)<<2, col_a = tx>>5;
    // B tile: thread tx handles the float4 at row (tx%2)*4, column tx/2.
    int row_b = (tx&1)<<2, col_b = tx>>1;
    int lda8 = lda<<3; // lda*8: offset to advance A by one 8-column K-slice.
    // Top-left corner of this thread's 8x8 micro-tile within the C block tile.
    int row_c = (tx&15)<<3, col_c = (tx>>4)<<3;
    // Shift the matrix base pointers to this block's tile.
    A = &A((bx<<7),0);
    B = &B(0,(by<<7));
    C = &C((bx<<7),(by<<7));//the TB size is 128.
    // Shared staging tiles: 128x8 floats each (1024 = 128*8), matching the
    // sa8/sb8 accessor macros' column stride of 128.
    __shared__ float sa8[1024];
    __shared__ float sb8[1024];
    // Av/Bv: vectorized operands; Cv: existing C values for the beta term;
    // Cres: the 8x8 per-thread accumulator (16 float4s).
    float4 Av1, Av2, Bv1, Bv2, Cv[16], Cres[16];
    memset(Cres, 0, sizeof(Cres));//clear registers
    // Main K-loop over 8-wide slices.
    // NOTE(review): SOURCE is truncated here — the loop condition/body and the
    // remainder of the kernel are not visible in this chunk.
    for (int k_count = 0; k_count