I try to write this simple C function for AVX2 horizontal sum in assembly.
static inline float hsum256_ps(__m256 v) {
__m128 lo = _mm256_castps256_ps128(v);
__m128 hi = _mm256_extractf128_ps(v, 1);
__m128 sum128 = _mm_add_ps(lo, hi);
__m128 shuf = _mm_movehdup_ps(sum128);
__m128 sums = _mm_add_ps(sum128, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
return _mm_cvtss_f32(sums);
}
I came up with
static inline float hsum256_ps(__m256 v){
float r;
__asm__ __volatile__ (
"vextractf128 $0, %1, %%xmm0 \n\t"
"vextractf128 $1, %1, %%xmm1 \n\t"
"vaddps %%xmm0, %%xmm0, %%xmm1 \n\t"
"movshdup %%xmm0, %%xmm1 \n\t"
"vaddps %%xmm0, %%xmm0, %%xmm1 \n\t"
"movhlps %%xmm0, %%xmm1 \n\t"
"vaddss %%xmm0, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm0, %0 \n\t"
: "=m"(r)
: "x"(v)
: "xmm0", "xmm1"
);
return r;
}
but it does not seem to work. Where did I do wrong?