assembly - C Loop Unrolling very inefficient (Clang, GCC,...)

Why do the compilers produce inefficient loop unrolling.

GCC Version 14.2
Clang Version 19.1.0
Compile args for both: -Ofast -lm

Example code:

#define MIN(a,b) ((a) < (b) ? (a) : (b))
void to_test(size_t num_iters)
{
    const size_t exec_iters = MIN(num_iters, 10);

    #if __GNUC__
    #pragma GCC unroll 10
    #elif __clang__
    #pragma unroll 10
    #endif
    for (size_t i = 0; i < exec_iters; ++i) {
        printf("%d\n", i);
    }
}

In this case, what both compilers do, is duplicate the code (unrolling). and after every execution it does a conditional jump to see if its finished gcc Godbolt for gcc variant 1 clang Godbolt for clang variant 1

So both do basically this, which is quite suboptimal.

to_test:
        test    rdi, rdi
        je      .L33
        push    rbx
        xor     esi, esi
        mov     rbx, rdi
        xor     eax, eax
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 1
        je      .L1
        xor     eax, eax
        mov     esi, 1
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 2
        je      .L1
        xor     eax, eax
        mov     esi, 2
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 3
        je      .L1
        xor     eax, eax
        mov     esi, 3
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 4
        je      .L1
        xor     eax, eax
        mov     esi, 4
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 5
        je      .L1
        xor     eax, eax
        mov     esi, 5
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 6
        je      .L1
        xor     eax, eax
        mov     esi, 6
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 7
        je      .L1
        xor     eax, eax
        mov     esi, 7
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 8
        je      .L1
        xor     eax, eax
        mov     esi, 8
        mov     edi, OFFSET FLAT:.LC0
        call    printf
        cmp     rbx, 9
        je      .L1
        mov     esi, 9
        mov     edi, OFFSET FLAT:.LC0
        xor     eax, eax
        pop     rbx
        jmp     printf

Now if we slighly modify the code to

void to_test(size_t num_iters)
{
    const size_t exec_iters = fmin(num_iters, 10);

    #if __GNUC__
    #pragma GCC unroll 10
    #elif __clang__
    #pragma unroll 10
    #endif
    for (size_t i = 0; i < exec_iters; ++i) {
        printf("%d\n", i);
    }
}

We see them producing different code. GCC Godbolt for gcc variant 2 performs a linear search thus lots of comparisons.

.L5:
        test    r12, r12
        je      .L1
        mov     rax, r12
        xor     ebx, ebx
        and     eax, 7
        je      .L7
        cmp     rax, 1
        je      .L32
        cmp     rax, 2
        je      .L33
        cmp     rax, 3
        je      .L34
        cmp     rax, 4
        je      .L35
        cmp     rax, 5
        je      .L36
        cmp     rax, 6
        jne     .L49

Clang makes it even weirder. It first checks if the value results in 10. if this is the case, it will use the fully unrolled loop (no if checks afterwards). But if its smaller, it doesn't do any loop unrolling and just performs the ordinary loop. Clang Godbolt for clang variant 2

        cmp     r12, 10
        jb      .LBB0_4
        sub     r12, r15
        lea     r14, [rip + .L.str]
        xor     ebx, ebx
.LBB0_3:
        mov     rdi, r14
        mov     rsi, rbx
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 1]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 2]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 3]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 4]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 5]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 6]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 7]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 8]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        lea     rsi, [rbx + 9]
        mov     rdi, r14
        xor     eax, eax
        call    printf@PLT
        add     rbx, 10
        cmp     rbx, r12
        jne     .LBB0_3
.LBB0_4:
        cmp     r13, 10
        je      .LBB0_7
        lea     r14, [rip + .L.str]
.LBB0_6:
        mov     rdi, r14
        mov     rsi, rbx
        xor     eax, eax
        call    printf@PLT
        inc     rbx
        dec     r15
        jne     .LBB0_6

Final question, these variants all seem quite ineffective. Especially the linear search for the correct value by GCC or clang not doing any unrolling at all in the second case.

Is it possible to let the compiler do something more clever. For example something like this (so it calcs an initial start_index) (it has the max number of possible iterations) and then jump directly to the final location.

// assumption (exec_iters = 4)
// then start_idx = 4 - 10 = -6
const start_idx = -6;
jump label_fabs({start_idx})

:label_0
printf("%d\n", start_idx + 0);     // i = 0
:label_1
printf("%d\n", start_idx + 1);     // i = 1
:label_2
printf("%d\n", start_idx + 2);     // i = 2
:label_3
printf("%d\n", start_idx + 3);     // i = 3
:label_4
printf("%d\n", start_idx + 4);     // i = 4
:label_5
printf("%d\n", start_idx + 5);     // i = 5
:label_6
printf("%d\n", start_idx + 6);     // i = 6
:label_7
printf("%d\n", start_idx + 7);     // i = 7
:label_8
printf("%d\n", start_idx + 8);     // i = 8
:label_9
printf("%d\n", start_idx + 9);     // i = 9

Or even building 10 functions and jump directly to the one with the exact number of iterations.

Even fixing the end of the loop and having a variable start (which should be easier for the compiler to optimize), does not result in something more clever.

#define MIN(a,b) ((a) < (b) ? (a) : (b))
void to_test(size_t num_iters)
{
    const size_t exec_iters = MIN(num_iters, 10);

    int loop_start = 10 - exec_iters;
    int base_offset = exec_iters - 10;
    int running_offset = loop_start;

    #if __GNUC__
    #pragma GCC unroll 10
    #elif __clang__
    #pragma unroll 10
    #endif
    for (size_t i = loop_start; i < 10; ++i) {
        printf("%d\n", base_offset + running_offset);
        ++running_offset;
    }
}

GCC Version 14.2 Clang Version 19.1.0 Compile args for both: -Ofast -lm

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

assembly - C Loop Unrolling very inefficient (Clang, GCC,...) - Stack Overflow

与本文相关的文章

评论列表(0)