webassembly - How correctly convert _mm_hadd_epi32 to equivalent C++ code (wasm_simd128.h) with Emscripten?

I have some code written with simd instructions to convert RGBA color data to grayscale data compiled to WASM thanks to Emscripten. It works fine and i hadn't any issues doing this. But i would make a step further, i would rewrite the code with the wasm_simd128.h header, in this code i have some lines invoking _mm_hadd_epi32, from Emscripten Simd docs they said: ⚠️ emulated with a SIMD add+two shuffles i converted the code with wasm_i32x4_add but the two shuffles it's something i don't completely understand. I post the original code arVideoLumaRGBAtoL_Intel_simd_asm and my translation to simd128 arVideoLumaRGBAtoL_Emscripten_simd128:

static void arVideoLumaRGBAtoL_Intel_simd_asm(uint8_t *__restrict dest,
                                              uint8_t *__restrict src,
                                              int32_t numPixels) {
  __m128i *pin = (__m128i *)src;
  uint32_t *pout = (uint32_t *)dest;
  int numPixelsDiv8 = numPixels >> 3;
  __m128i RGBScale = _mm_set_epi16(
      0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
      R8_CCIR601); // RGBScale =
                   // 000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601]000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601].

  do {
    __m128i pixels0_3 = _mm_load_si128(
        pin++); // pixels0_3 =
                // [A3][B3][G3][R3][A2][B2][G2][R2][A1][B1][G1][R1][A0][B0][G0][R0].
    __m128i pixels4_7 = _mm_load_si128(
        pin++); // pixels4_7 =
                // [A7][B7][G7][R7][A6][B6][G6][R6][A5][B5][G5][R5][A4][B4][G4][R4].

    __m128i pixels0_3_l = _mm_unpacklo_epi8(
        pixels0_3,
        _mm_setzero_si128()); // pixels0_3_l =
                              // 00[A1]00[B1]00[G1]00[R1]00[A0]00[B0]00[G0]00[R0].
    __m128i pixels0_3_h = _mm_unpackhi_epi8(
        pixels0_3,
        _mm_setzero_si128()); // pixels0_3_h =
                              // 00[A3]00[B3]00[G3]00[R3]00[A2]00[B2]00[G2]00[R2].
    __m128i pixels4_7_l = _mm_unpacklo_epi8(
        pixels4_7,
        _mm_setzero_si128()); // pixels4_7_l =
                              // 00[A5]00[B5]00[G5]00[R5]00[A4]00[B4]00[G4]00[R4].
    __m128i pixels4_7_h = _mm_unpackhi_epi8(
        pixels4_7,
        _mm_setzero_si128()); // pixels4_7_h =
                              // 00[A7]00[B7]00[G7]00[R7]00[A6]00[B6]00[G6]00[R6].

    __m128i y0_3_l = _mm_madd_epi16(pixels0_3_l, RGBScale);
    __m128i y0_3_h = _mm_madd_epi16(pixels0_3_h, RGBScale);
    __m128i y4_7_l = _mm_madd_epi16(pixels4_7_l, RGBScale);
    __m128i y4_7_h = _mm_madd_epi16(pixels4_7_h, RGBScale);
    __m128i y0_3 = _mm_hadd_epi32(y0_3_l, y0_3_h);
    __m128i y4_7 = _mm_hadd_epi32(y4_7_l, y4_7_h);

    y0_3 = _mm_srli_epi32(y0_3, 8);
    y4_7 = _mm_srli_epi32(y4_7, 8);
    y0_3 = _mm_packs_epi32(y0_3, y0_3);
    y4_7 = _mm_packs_epi32(y4_7, y4_7);
    y0_3 = _mm_packus_epi16(y0_3, y0_3);
    y4_7 = _mm_packus_epi16(y4_7, y4_7);

    *pout++ = _mm_cvtsi128_si32(y0_3);
    *pout++ = _mm_cvtsi128_si32(y4_7);

    numPixelsDiv8--;
  } while (numPixelsDiv8);
}

static void arVideoLumaRGBAtoL_Emscripten_simd128(uint8_t *__restrict dest,
                                                  uint8_t *__restrict src,
                                                  int32_t numPixels) {
  v128_t RGBScale = wasm_i16x8_make(
      0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
      R8_CCIR601); // RGBScale = [0, B8_CCIR601, G8_CCIR601, R8_CCIR601]
  int numPixelsDiv8 = numPixels >> 3;

  for (int i = 0; i < numPixelsDiv8; i++) {
    v128_t pixels0_3 = wasm_v128_load(src); // Load 16 bytes (4 pixels) from src
    v128_t pixels4_7 =
        wasm_v128_load(src + 16); // Load next 16 bytes (4 pixels) from src

    // Unpack and interleave the low and high bytes of each 16-byte lane
    v128_t pixels0_3_l =
        wasm_i16x8_shuffle(pixels0_3, pixels0_3, 0, 2, 4, 6, 8, 10, 12, 14);
    v128_t pixels0_3_h =
        wasm_i16x8_shuffle(pixels0_3, pixels0_3, 1, 3, 5, 7, 9, 11, 13, 15);
    v128_t pixels4_7_l =
        wasm_i16x8_shuffle(pixels4_7, pixels4_7, 0, 2, 4, 6, 8, 10, 12, 14);
    v128_t pixels4_7_h =
        wasm_i16x8_shuffle(pixels4_7, pixels4_7, 1, 3, 5, 7, 9, 11, 13, 15);

    // Multiply and add the RGB components
    v128_t y0_3_l = wasm_i32x4_dot_i16x8(pixels0_3_l, RGBScale);
    v128_t y0_3_h = wasm_i32x4_dot_i16x8(pixels0_3_h, RGBScale);
    v128_t y4_7_l = wasm_i32x4_dot_i16x8(pixels4_7_l, RGBScale);
    v128_t y4_7_h = wasm_i32x4_dot_i16x8(pixels4_7_h, RGBScale);

    // Horizontal add the result
    v128_t y0_3 = wasm_i32x4_add(y0_3_l, y0_3_h);
    v128_t y4_7 = wasm_i32x4_add(y4_7_l, y4_7_h);

    // Shift right by 8 bits to divide by 256
    y0_3 = wasm_u32x4_shr(y0_3, 8);
    y4_7 = wasm_u32x4_shr(y4_7, 8);

    // Pack the 32-bit results into 16-bit and then into 8-bit values
    y0_3 = wasm_i16x8_narrow_i32x4(y0_3, y0_3);
    y4_7 = wasm_i16x8_narrow_i32x4(y4_7, y4_7);
    y0_3 = wasm_u8x16_narrow_i16x8(y0_3, y0_3);
    y4_7 = wasm_u8x16_narrow_i16x8(y4_7, y4_7);

    // Store the result back to dest
    wasm_v128_store(dest, y0_3);
    wasm_v128_store(dest + 16, y4_7);

    src += 32;
    dest += 8;
  }
}

i think the wasm_i32x4_add it's not enough, the code is built but the result is a washed image instead of a gray one.

Post Edit: my code is hosted on github see my PR

static void arVideoLumaRGBAtoL_Intel_simd_asm(uint8_t *__restrict dest,
                                              uint8_t *__restrict src,
                                              int32_t numPixels) {
  __m128i *pin = (__m128i *)src;
  uint32_t *pout = (uint32_t *)dest;
  int numPixelsDiv8 = numPixels >> 3;
  __m128i RGBScale = _mm_set_epi16(
      0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
      R8_CCIR601); // RGBScale =
                   // 000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601]000000[B8_CCIR601]00[G8_CCIR601]00[R8_CCIR601].

  do {
    __m128i pixels0_3 = _mm_load_si128(
        pin++); // pixels0_3 =
                // [A3][B3][G3][R3][A2][B2][G2][R2][A1][B1][G1][R1][A0][B0][G0][R0].
    __m128i pixels4_7 = _mm_load_si128(
        pin++); // pixels4_7 =
                // [A7][B7][G7][R7][A6][B6][G6][R6][A5][B5][G5][R5][A4][B4][G4][R4].

    __m128i pixels0_3_l = _mm_unpacklo_epi8(
        pixels0_3,
        _mm_setzero_si128()); // pixels0_3_l =
                              // 00[A1]00[B1]00[G1]00[R1]00[A0]00[B0]00[G0]00[R0].
    __m128i pixels0_3_h = _mm_unpackhi_epi8(
        pixels0_3,
        _mm_setzero_si128()); // pixels0_3_h =
                              // 00[A3]00[B3]00[G3]00[R3]00[A2]00[B2]00[G2]00[R2].
    __m128i pixels4_7_l = _mm_unpacklo_epi8(
        pixels4_7,
        _mm_setzero_si128()); // pixels4_7_l =
                              // 00[A5]00[B5]00[G5]00[R5]00[A4]00[B4]00[G4]00[R4].
    __m128i pixels4_7_h = _mm_unpackhi_epi8(
        pixels4_7,
        _mm_setzero_si128()); // pixels4_7_h =
                              // 00[A7]00[B7]00[G7]00[R7]00[A6]00[B6]00[G6]00[R6].

    __m128i y0_3_l = _mm_madd_epi16(pixels0_3_l, RGBScale);
    __m128i y0_3_h = _mm_madd_epi16(pixels0_3_h, RGBScale);
    __m128i y4_7_l = _mm_madd_epi16(pixels4_7_l, RGBScale);
    __m128i y4_7_h = _mm_madd_epi16(pixels4_7_h, RGBScale);
    __m128i y0_3 = _mm_hadd_epi32(y0_3_l, y0_3_h);
    __m128i y4_7 = _mm_hadd_epi32(y4_7_l, y4_7_h);

    y0_3 = _mm_srli_epi32(y0_3, 8);
    y4_7 = _mm_srli_epi32(y4_7, 8);
    y0_3 = _mm_packs_epi32(y0_3, y0_3);
    y4_7 = _mm_packs_epi32(y4_7, y4_7);
    y0_3 = _mm_packus_epi16(y0_3, y0_3);
    y4_7 = _mm_packus_epi16(y4_7, y4_7);

    *pout++ = _mm_cvtsi128_si32(y0_3);
    *pout++ = _mm_cvtsi128_si32(y4_7);

    numPixelsDiv8--;
  } while (numPixelsDiv8);
}

static void arVideoLumaRGBAtoL_Emscripten_simd128(uint8_t *__restrict dest,
                                                  uint8_t *__restrict src,
                                                  int32_t numPixels) {
  v128_t RGBScale = wasm_i16x8_make(
      0, B8_CCIR601, G8_CCIR601, R8_CCIR601, 0, B8_CCIR601, G8_CCIR601,
      R8_CCIR601); // RGBScale = [0, B8_CCIR601, G8_CCIR601, R8_CCIR601]
  int numPixelsDiv8 = numPixels >> 3;

  for (int i = 0; i < numPixelsDiv8; i++) {
    v128_t pixels0_3 = wasm_v128_load(src); // Load 16 bytes (4 pixels) from src
    v128_t pixels4_7 =
        wasm_v128_load(src + 16); // Load next 16 bytes (4 pixels) from src

    // Unpack and interleave the low and high bytes of each 16-byte lane
    v128_t pixels0_3_l =
        wasm_i16x8_shuffle(pixels0_3, pixels0_3, 0, 2, 4, 6, 8, 10, 12, 14);
    v128_t pixels0_3_h =
        wasm_i16x8_shuffle(pixels0_3, pixels0_3, 1, 3, 5, 7, 9, 11, 13, 15);
    v128_t pixels4_7_l =
        wasm_i16x8_shuffle(pixels4_7, pixels4_7, 0, 2, 4, 6, 8, 10, 12, 14);
    v128_t pixels4_7_h =
        wasm_i16x8_shuffle(pixels4_7, pixels4_7, 1, 3, 5, 7, 9, 11, 13, 15);

    // Multiply and add the RGB components
    v128_t y0_3_l = wasm_i32x4_dot_i16x8(pixels0_3_l, RGBScale);
    v128_t y0_3_h = wasm_i32x4_dot_i16x8(pixels0_3_h, RGBScale);
    v128_t y4_7_l = wasm_i32x4_dot_i16x8(pixels4_7_l, RGBScale);
    v128_t y4_7_h = wasm_i32x4_dot_i16x8(pixels4_7_h, RGBScale);

    // Horizontal add the result
    v128_t y0_3 = wasm_i32x4_add(y0_3_l, y0_3_h);
    v128_t y4_7 = wasm_i32x4_add(y4_7_l, y4_7_h);

    // Shift right by 8 bits to divide by 256
    y0_3 = wasm_u32x4_shr(y0_3, 8);
    y4_7 = wasm_u32x4_shr(y4_7, 8);

    // Pack the 32-bit results into 16-bit and then into 8-bit values
    y0_3 = wasm_i16x8_narrow_i32x4(y0_3, y0_3);
    y4_7 = wasm_i16x8_narrow_i32x4(y4_7, y4_7);
    y0_3 = wasm_u8x16_narrow_i16x8(y0_3, y0_3);
    y4_7 = wasm_u8x16_narrow_i16x8(y4_7, y4_7);

    // Store the result back to dest
    wasm_v128_store(dest, y0_3);
    wasm_v128_store(dest + 16, y4_7);

    src += 32;
    dest += 8;
  }
}

i think the wasm_i32x4_add it's not enough, the code is built but the result is a washed image instead of a gray one.

Post Edit: my code is hosted on github see my PR

Share Improve this question edited Jan 29 at 19:56 asked Jan 29 at 16:49 kalwalt 4924 silver badges14 bronze badges

1 Well you do need the shuffles, you can't just leave them out. – user555045 Commented Jan 29 at 16:56
hi @user555045 thanks for the answer, do you see something that is not correct with my code? – kalwalt Commented Jan 29 at 17:21
1 Yes, wasm_i32x4_add is a vertical add, like _mm_add_epi32 not hadd. – Peter Cordes Commented Jan 29 at 17:36
Github Copilot suggested to create a hadd_epi32 custom function: v128_t hadd_epi32(v128_t a, v128_t b) { v128_t shuffled_a = wasm_v32x4_shuffle(a, b, 2, 0, 6, 4); v128_t shuffled_b = wasm_v32x4_shuffle(a, b, 3, 1, 7, 5); return wasm_i32x4_add(shuffled_a, shuffled_b); } based on the _mm_hadd_epi32 defined in the system/include/compat/tmmintrin.h file . – kalwalt Commented Jan 29 at 18:10
1 If that's not how wasm_v32x4_shuffle works, then it's AI hallucination. – Peter Cordes Commented Jan 29 at 18:46

| Show 4 more comments

1 Answer 1

Sorted by: Reset to default 2

I can’t help you with webassembly SIMD. But note that on AMD64, _mm_hadd_epi32 instruction is relatively slow, it decodes into 3-4 microops depending on the CPU.

It’s possible to refactor your algorithm eliminating the need for _mm_hadd_epi32. Don’t use _mm_unpacklo_epi8 / _mm_unpackhi_epi8, leave the bytes in the correct uint32_t lanes of these vectors. If you do that, you don’t need to add int32 numbers pairwise, the vertical sum instruction _mm_add_epi32 is very fast on all processors.

static void arVideoLumaRGBAtoL_Intel_simd_asm( uint8_t* __restrict dest,
    uint8_t* __restrict src, int32_t numPixels )
{
    __m128i* pin = ( __m128i* )src;
    int64_t* pout = (int64_t*)dest;
    int numPixelsDiv8 = numPixels / 8;

    __m128i maskRedBlue = _mm_set1_epi32( 0x00FF00FF );
    __m128i scaleRedBlue = _mm_set1_epi32( (uint32_t)B8_CCIR601 << 16 | R8_CCIR601 );
    __m128i scaleGreen = _mm_set1_epi32( G8_CCIR601 );

    do
    {
        __m128i pixels1 = _mm_load_si128( pin );
        __m128i pixels2 = _mm_load_si128( pin + 1 );
        pin += 2;

        // Shifting uint16 lanes by 8 bits leaves 0 in the higher bytes, no need to mask
        __m128i g1 = _mm_srli_epi16( pixels1, 8 );
        __m128i g2 = _mm_srli_epi16( pixels2, 8 );

        // For red and blue channels, bitwise AND with the mask of 0x00FF00FF to isolate
        __m128i rb1 = _mm_and_si128( pixels1, maskRedBlue );
        __m128i rb2 = _mm_and_si128( pixels2, maskRedBlue );

        // Scale the numbers, and add pairwise
        g1 = _mm_madd_epi16( g1, scaleGreen );
        g2 = _mm_madd_epi16( g2, scaleGreen );
        rb1 = _mm_madd_epi16( rb1, scaleRedBlue );
        rb2 = _mm_madd_epi16( rb2, scaleRedBlue );

        // We now have them in int32 lanes in the correct order, add vertically
        __m128i y1 = _mm_add_epi32( g1, rb1 );
        __m128i y2 = _mm_add_epi32( g2, rb2 );

        // Divide by 256
        y1 = _mm_srli_epi32( y1, 8 );
        y2 = _mm_srli_epi32( y2, 8 );

        // Pack 32-bit lanes into unsigned bytes, with saturation
        __m128i y = _mm_packs_epi32( y1, y2 );
        y = _mm_packus_epi16( y, y );

        // Store 8 bytes with 1 instruction
        *pout = _mm_cvtsi128_si64( y );
        pout++;
        numPixelsDiv8--;
    }
    while( numPixelsDiv8 );
}

The code is untested.

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

webassembly - How correctly convert _mm_hadd_epi32 to equivalent C++ code (wasm_simd128.h) with Emscripten? - Stack Overflow

1 Answer 1

与本文相关的文章

评论列表(0)