The code is pretty simple
kernel void naive(
constant RunParams *param [[ buffer(0) ]],
const device float *A [[ buffer(1) ]], // [N, K]
device float *output [[ buffer(2) ]],
uint2 gid [[ thread_position_in_grid ]]) {
uint a_ptr = gid.x * param->K;
for (uint i = 0; i < param->K; i++, a_ptr++) {
val += A[b_ptr];
}
output[ptr] = val;
}
when uint a_ptr = gid.x * param->K
, the code got 150 GFLops
when uint a_ptr = gid.y * param->K
, the code got 860 GFLops
param->K = 256; thread per group: [16, 16]
I'd like to understand why the performance is so different, and how can I profile/diagnose this to help with further optimization.