I have some code written with simd instructions to convert RGBA color data to grayscale data compiled to WASM thanks to Emscripten. It works fine and i hadn't any issues doing this. But i would make a step further, i would rewrite the code with the wasm_simd128.h header, in this code i have some lines invoking _mm_hadd_epi32
, from Emscripten Simd docs they said: ⚠️ emulated with a SIMD add+two shuffles
i converted the code with wasm_i32x4_add
but the two shuffles it's something i don't completely understand. I post the original code arVideoLumaRGBAtoL_Intel_simd_asm
and my translation to simd128 arVideoLumaRGBAtoL_Emscripten_simd128
:
static void arVideoLumaRGBAtoL_Intel_simd_asm(uint8_t *__restrict dest,
uint8_t *__restrict src,
int32_t numPixels) {
__m128i *pin = (__m128i *)src;
uint32_t *pout = (uint32_t *)dest;
int numPixelsDiv8 = numPixels >> 3;
__m128i RGBScale = _mm_set_epi16(
0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
R8_CCIR601); // RGBScale =
// 000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601]000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601].
do {
__m128i pixels0_3 = _mm_load_si128(
pin++); // pixels0_3 =
// [A3][B3][G3][R3][A2][B2][G2][R2][A1][B1][G1][R1][A0][B0][G0][R0].
__m128i pixels4_7 = _mm_load_si128(
pin++); // pixels4_7 =
// [A7][B7][G7][R7][A6][B6][G6][R6][A5][B5][G5][R5][A4][B4][G4][R4].
__m128i pixels0_3_l = _mm_unpacklo_epi8(
pixels0_3,
_mm_setzero_si128()); // pixels0_3_l =
// 00[A1]00[B1]00[G1]00[R1]00[A0]00[B0]00[G0]00[R0].
__m128i pixels0_3_h = _mm_unpackhi_epi8(
pixels0_3,
_mm_setzero_si128()); // pixels0_3_h =
// 00[A3]00[B3]00[G3]00[R3]00[A2]00[B2]00[G2]00[R2].
__m128i pixels4_7_l = _mm_unpacklo_epi8(
pixels4_7,
_mm_setzero_si128()); // pixels4_7_l =
// 00[A5]00[B5]00[G5]00[R5]00[A4]00[B4]00[G4]00[R4].
__m128i pixels4_7_h = _mm_unpackhi_epi8(
pixels4_7,
_mm_setzero_si128()); // pixels4_7_h =
// 00[A7]00[B7]00[G7]00[R7]00[A6]00[B6]00[G6]00[R6].
__m128i y0_3_l = _mm_madd_epi16(pixels0_3_l, RGBScale);
__m128i y0_3_h = _mm_madd_epi16(pixels0_3_h, RGBScale);
__m128i y4_7_l = _mm_madd_epi16(pixels4_7_l, RGBScale);
__m128i y4_7_h = _mm_madd_epi16(pixels4_7_h, RGBScale);
__m128i y0_3 = _mm_hadd_epi32(y0_3_l, y0_3_h);
__m128i y4_7 = _mm_hadd_epi32(y4_7_l, y4_7_h);
y0_3 = _mm_srli_epi32(y0_3, 8);
y4_7 = _mm_srli_epi32(y4_7, 8);
y0_3 = _mm_packs_epi32(y0_3, y0_3);
y4_7 = _mm_packs_epi32(y4_7, y4_7);
y0_3 = _mm_packus_epi16(y0_3, y0_3);
y4_7 = _mm_packus_epi16(y4_7, y4_7);
*pout++ = _mm_cvtsi128_si32(y0_3);
*pout++ = _mm_cvtsi128_si32(y4_7);
numPixelsDiv8--;
} while (numPixelsDiv8);
}
static void arVideoLumaRGBAtoL_Emscripten_simd128(uint8_t *__restrict dest,
uint8_t *__restrict src,
int32_t numPixels) {
v128_t RGBScale = wasm_i16x8_make(
0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
R8_CCIR601); // RGBScale = [0, B8_CCIR601, G8_CCIR601, R8_CCIR601]
int numPixelsDiv8 = numPixels >> 3;
for (int i = 0; i < numPixelsDiv8; i++) {
v128_t pixels0_3 = wasm_v128_load(src); // Load 16 bytes (4 pixels) from src
v128_t pixels4_7 =
wasm_v128_load(src + 16); // Load next 16 bytes (4 pixels) from src
// Unpack and interleave the low and high bytes of each 16-byte lane
v128_t pixels0_3_l =
wasm_i16x8_shuffle(pixels0_3, pixels0_3, 0, 2, 4, 6, 8, 10, 12, 14);
v128_t pixels0_3_h =
wasm_i16x8_shuffle(pixels0_3, pixels0_3, 1, 3, 5, 7, 9, 11, 13, 15);
v128_t pixels4_7_l =
wasm_i16x8_shuffle(pixels4_7, pixels4_7, 0, 2, 4, 6, 8, 10, 12, 14);
v128_t pixels4_7_h =
wasm_i16x8_shuffle(pixels4_7, pixels4_7, 1, 3, 5, 7, 9, 11, 13, 15);
// Multiply and add the RGB components
v128_t y0_3_l = wasm_i32x4_dot_i16x8(pixels0_3_l, RGBScale);
v128_t y0_3_h = wasm_i32x4_dot_i16x8(pixels0_3_h, RGBScale);
v128_t y4_7_l = wasm_i32x4_dot_i16x8(pixels4_7_l, RGBScale);
v128_t y4_7_h = wasm_i32x4_dot_i16x8(pixels4_7_h, RGBScale);
// Horizontal add the result
v128_t y0_3 = wasm_i32x4_add(y0_3_l, y0_3_h);
v128_t y4_7 = wasm_i32x4_add(y4_7_l, y4_7_h);
// Shift right by 8 bits to divide by 256
y0_3 = wasm_u32x4_shr(y0_3, 8);
y4_7 = wasm_u32x4_shr(y4_7, 8);
// Pack the 32-bit results into 16-bit and then into 8-bit values
y0_3 = wasm_i16x8_narrow_i32x4(y0_3, y0_3);
y4_7 = wasm_i16x8_narrow_i32x4(y4_7, y4_7);
y0_3 = wasm_u8x16_narrow_i16x8(y0_3, y0_3);
y4_7 = wasm_u8x16_narrow_i16x8(y4_7, y4_7);
// Store the result back to dest
wasm_v128_store(dest, y0_3);
wasm_v128_store(dest + 16, y4_7);
src += 32;
dest += 8;
}
}
i think the wasm_i32x4_add
it's not enough, the code is built but the result is a washed image instead of a gray one.
Post Edit: my code is hosted on github see my PR
I have some code written with simd instructions to convert RGBA color data to grayscale data compiled to WASM thanks to Emscripten. It works fine and i hadn't any issues doing this. But i would make a step further, i would rewrite the code with the wasm_simd128.h header, in this code i have some lines invoking _mm_hadd_epi32
, from Emscripten Simd docs they said: ⚠️ emulated with a SIMD add+two shuffles
i converted the code with wasm_i32x4_add
but the two shuffles it's something i don't completely understand. I post the original code arVideoLumaRGBAtoL_Intel_simd_asm
and my translation to simd128 arVideoLumaRGBAtoL_Emscripten_simd128
:
static void arVideoLumaRGBAtoL_Intel_simd_asm(uint8_t *__restrict dest,
uint8_t *__restrict src,
int32_t numPixels) {
__m128i *pin = (__m128i *)src;
uint32_t *pout = (uint32_t *)dest;
int numPixelsDiv8 = numPixels >> 3;
__m128i RGBScale = _mm_set_epi16(
0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
R8_CCIR601); // RGBScale =
// 000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601]000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601].
do {
__m128i pixels0_3 = _mm_load_si128(
pin++); // pixels0_3 =
// [A3][B3][G3][R3][A2][B2][G2][R2][A1][B1][G1][R1][A0][B0][G0][R0].
__m128i pixels4_7 = _mm_load_si128(
pin++); // pixels4_7 =
// [A7][B7][G7][R7][A6][B6][G6][R6][A5][B5][G5][R5][A4][B4][G4][R4].
__m128i pixels0_3_l = _mm_unpacklo_epi8(
pixels0_3,
_mm_setzero_si128()); // pixels0_3_l =
// 00[A1]00[B1]00[G1]00[R1]00[A0]00[B0]00[G0]00[R0].
__m128i pixels0_3_h = _mm_unpackhi_epi8(
pixels0_3,
_mm_setzero_si128()); // pixels0_3_h =
// 00[A3]00[B3]00[G3]00[R3]00[A2]00[B2]00[G2]00[R2].
__m128i pixels4_7_l = _mm_unpacklo_epi8(
pixels4_7,
_mm_setzero_si128()); // pixels4_7_l =
// 00[A5]00[B5]00[G5]00[R5]00[A4]00[B4]00[G4]00[R4].
__m128i pixels4_7_h = _mm_unpackhi_epi8(
pixels4_7,
_mm_setzero_si128()); // pixels4_7_h =
// 00[A7]00[B7]00[G7]00[R7]00[A6]00[B6]00[G6]00[R6].
__m128i y0_3_l = _mm_madd_epi16(pixels0_3_l, RGBScale);
__m128i y0_3_h = _mm_madd_epi16(pixels0_3_h, RGBScale);
__m128i y4_7_l = _mm_madd_epi16(pixels4_7_l, RGBScale);
__m128i y4_7_h = _mm_madd_epi16(pixels4_7_h, RGBScale);
__m128i y0_3 = _mm_hadd_epi32(y0_3_l, y0_3_h);
__m128i y4_7 = _mm_hadd_epi32(y4_7_l, y4_7_h);
y0_3 = _mm_srli_epi32(y0_3, 8);
y4_7 = _mm_srli_epi32(y4_7, 8);
y0_3 = _mm_packs_epi32(y0_3, y0_3);
y4_7 = _mm_packs_epi32(y4_7, y4_7);
y0_3 = _mm_packus_epi16(y0_3, y0_3);
y4_7 = _mm_packus_epi16(y4_7, y4_7);
*pout++ = _mm_cvtsi128_si32(y0_3);
*pout++ = _mm_cvtsi128_si32(y4_7);
numPixelsDiv8--;
} while (numPixelsDiv8);
}
static void arVideoLumaRGBAtoL_Emscripten_simd128(uint8_t *__restrict dest,
uint8_t *__restrict src,
int32_t numPixels) {
v128_t RGBScale = wasm_i16x8_make(
0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
R8_CCIR601); // RGBScale = [0, B8_CCIR601, G8_CCIR601, R8_CCIR601]
int numPixelsDiv8 = numPixels >> 3;
for (int i = 0; i < numPixelsDiv8; i++) {
v128_t pixels0_3 = wasm_v128_load(src); // Load 16 bytes (4 pixels) from src
v128_t pixels4_7 =
wasm_v128_load(src + 16); // Load next 16 bytes (4 pixels) from src
// Unpack and interleave the low and high bytes of each 16-byte lane
v128_t pixels0_3_l =
wasm_i16x8_shuffle(pixels0_3, pixels0_3, 0, 2, 4, 6, 8, 10, 12, 14);
v128_t pixels0_3_h =
wasm_i16x8_shuffle(pixels0_3, pixels0_3, 1, 3, 5, 7, 9, 11, 13, 15);
v128_t pixels4_7_l =
wasm_i16x8_shuffle(pixels4_7, pixels4_7, 0, 2, 4, 6, 8, 10, 12, 14);
v128_t pixels4_7_h =
wasm_i16x8_shuffle(pixels4_7, pixels4_7, 1, 3, 5, 7, 9, 11, 13, 15);
// Multiply and add the RGB components
v128_t y0_3_l = wasm_i32x4_dot_i16x8(pixels0_3_l, RGBScale);
v128_t y0_3_h = wasm_i32x4_dot_i16x8(pixels0_3_h, RGBScale);
v128_t y4_7_l = wasm_i32x4_dot_i16x8(pixels4_7_l, RGBScale);
v128_t y4_7_h = wasm_i32x4_dot_i16x8(pixels4_7_h, RGBScale);
// Horizontal add the result
v128_t y0_3 = wasm_i32x4_add(y0_3_l, y0_3_h);
v128_t y4_7 = wasm_i32x4_add(y4_7_l, y4_7_h);
// Shift right by 8 bits to divide by 256
y0_3 = wasm_u32x4_shr(y0_3, 8);
y4_7 = wasm_u32x4_shr(y4_7, 8);
// Pack the 32-bit results into 16-bit and then into 8-bit values
y0_3 = wasm_i16x8_narrow_i32x4(y0_3, y0_3);
y4_7 = wasm_i16x8_narrow_i32x4(y4_7, y4_7);
y0_3 = wasm_u8x16_narrow_i16x8(y0_3, y0_3);
y4_7 = wasm_u8x16_narrow_i16x8(y4_7, y4_7);
// Store the result back to dest
wasm_v128_store(dest, y0_3);
wasm_v128_store(dest + 16, y4_7);
src += 32;
dest += 8;
}
}
i think the wasm_i32x4_add
it's not enough, the code is built but the result is a washed image instead of a gray one.
Post Edit: my code is hosted on github see my PR
Share Improve this question edited Jan 29 at 19:56 kalwalt asked Jan 29 at 16:49 kalwaltkalwalt 4924 silver badges14 bronze badges 9 | Show 4 more comments1 Answer
Reset to default 2I can’t help you with webassembly SIMD. But note that on AMD64, _mm_hadd_epi32
instruction is relatively slow, it decodes into 3-4 microops depending on the CPU.
It’s possible to refactor your algorithm eliminating the need for _mm_hadd_epi32
. Don’t use _mm_unpacklo_epi8
/ _mm_unpackhi_epi8
, leave the bytes in the correct uint32_t
lanes of these vectors. If you do that, you don’t need to add int32 numbers pairwise, the vertical sum instruction _mm_add_epi32
is very fast on all processors.
static void arVideoLumaRGBAtoL_Intel_simd_asm( uint8_t* __restrict dest,
uint8_t* __restrict src, int32_t numPixels )
{
__m128i* pin = ( __m128i* )src;
int64_t* pout = (int64_t*)dest;
int numPixelsDiv8 = numPixels / 8;
__m128i maskRedBlue = _mm_set1_epi32( 0x00FF00FF );
__m128i scaleRedBlue = _mm_set1_epi32( (uint32_t)B8_CCIR601 << 16 | R8_CCIR601 );
__m128i scaleGreen = _mm_set1_epi32( G8_CCIR601 );
do
{
__m128i pixels1 = _mm_load_si128( pin );
__m128i pixels2 = _mm_load_si128( pin + 1 );
pin += 2;
// Shifting uint16 lanes by 8 bits leaves 0 in the higher bytes, no need to mask
__m128i g1 = _mm_srli_epi16( pixels1, 8 );
__m128i g2 = _mm_srli_epi16( pixels2, 8 );
// For red and blue channels, bitwise AND with the mask of 0x00FF00FF to isolate
__m128i rb1 = _mm_and_si128( pixels1, maskRedBlue );
__m128i rb2 = _mm_and_si128( pixels2, maskRedBlue );
// Scale the numbers, and add pairwise
g1 = _mm_madd_epi16( g1, scaleGreen );
g2 = _mm_madd_epi16( g2, scaleGreen );
rb1 = _mm_madd_epi16( rb1, scaleRedBlue );
rb2 = _mm_madd_epi16( rb2, scaleRedBlue );
// We now have them in int32 lanes in the correct order, add vertically
__m128i y1 = _mm_add_epi32( g1, rb1 );
__m128i y2 = _mm_add_epi32( g2, rb2 );
// Divide by 256
y1 = _mm_srli_epi32( y1, 8 );
y2 = _mm_srli_epi32( y2, 8 );
// Pack 32-bit lanes into unsigned bytes, with saturation
__m128i y = _mm_packs_epi32( y1, y2 );
y = _mm_packus_epi16( y, y );
// Store 8 bytes with 1 instruction
*pout = _mm_cvtsi128_si64( y );
pout++;
numPixelsDiv8--;
}
while( numPixelsDiv8 );
}
The code is untested.
wasm_i32x4_add
is a vertical add, like_mm_add_epi32
nothadd
. – Peter Cordes Commented Jan 29 at 17:36v128_t hadd_epi32(v128_t a, v128_t b) { v128_t shuffled_a = wasm_v32x4_shuffle(a, b, 2, 0, 6, 4); v128_t shuffled_b = wasm_v32x4_shuffle(a, b, 3, 1, 7, 5); return wasm_i32x4_add(shuffled_a, shuffled_b); }
based on the_mm_hadd_epi32
defined in the system/include/compat/tmmintrin.h file . – kalwalt Commented Jan 29 at 18:10wasm_v32x4_shuffle
works, then it's AI hallucination. – Peter Cordes Commented Jan 29 at 18:46