I am trying to speed up C++ row-major matrix multiplication on Android, but the SIMD instructions I implemented seem to be far from ideal and they fail to outperform the computation time of a naive implementation (I tested that with Samsung S21 and Xiaomi Poco F1). I'd be really grateful if you could point out what I am doing wrong. Thanks!
Here are minimal representations of my Matrix44f and Vector4f classes:
class alignas(16) Matrix44f {
public:
Vector4f mRows[4];
};
class alignas(16) Vector4f {
public:
float x, y, z, w;
};
The SIMD implementation:
Matrix44f operator*(const Matrix44f& pM1, const Matrix44f& pM2)
{
Matrix44f matRes;
float32x4_t m2_row0 = vld1q_f32(&pM2.mRows[0][0]);
float32x4_t m2_row1 = vld1q_f32(&pM2.mRows[1][0]);
float32x4_t m2_row2 = vld1q_f32(&pM2.mRows[2][0]);
float32x4_t m2_row3 = vld1q_f32(&pM2.mRows[3][0]);
float32x4_t m1_row, result;
for (int i = 0; i < 4; ++i)
{
m1_row = vld1q_f32(&pM1.mRows[i][0]);
result = vmulq_n_f32(m2_row0, vgetq_lane_f32(m1_row, 0));
result = vfmaq_n_f32(result, m2_row1, vgetq_lane_f32(m1_row, 1));
result = vfmaq_n_f32(result, m2_row2, vgetq_lane_f32(m1_row, 2));
result = vfmaq_n_f32(result, m2_row3, vgetq_lane_f32(m1_row, 3));
vst1q_f32(&matRes.mRows[i][0], result);
}
return matRes;
}