Why do the compilers produce inefficient loop unrolling.
GCC Version 14.2
Clang Version 19.1.0
Compile args for both: -Ofast -lm
Example code:
#define MIN(a,b) ((a) < (b) ? (a) : (b))
void to_test(size_t num_iters)
{
const size_t exec_iters = MIN(num_iters, 10);
#if __GNUC__
#pragma GCC unroll 10
#elif __clang__
#pragma unroll 10
#endif
for (size_t i = 0; i < exec_iters; ++i) {
printf("%d\n", i);
}
}
In this case, what both compilers do, is duplicate the code (unrolling). and after every execution it does a conditional jump to see if its finished gcc Godbolt for gcc variant 1 clang Godbolt for clang variant 1
So both do basically this, which is quite suboptimal.
to_test:
test rdi, rdi
je .L33
push rbx
xor esi, esi
mov rbx, rdi
xor eax, eax
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 1
je .L1
xor eax, eax
mov esi, 1
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 2
je .L1
xor eax, eax
mov esi, 2
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 3
je .L1
xor eax, eax
mov esi, 3
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 4
je .L1
xor eax, eax
mov esi, 4
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 5
je .L1
xor eax, eax
mov esi, 5
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 6
je .L1
xor eax, eax
mov esi, 6
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 7
je .L1
xor eax, eax
mov esi, 7
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 8
je .L1
xor eax, eax
mov esi, 8
mov edi, OFFSET FLAT:.LC0
call printf
cmp rbx, 9
je .L1
mov esi, 9
mov edi, OFFSET FLAT:.LC0
xor eax, eax
pop rbx
jmp printf
Now if we slighly modify the code to
void to_test(size_t num_iters)
{
const size_t exec_iters = fmin(num_iters, 10);
#if __GNUC__
#pragma GCC unroll 10
#elif __clang__
#pragma unroll 10
#endif
for (size_t i = 0; i < exec_iters; ++i) {
printf("%d\n", i);
}
}
We see them producing different code. GCC Godbolt for gcc variant 2 performs a linear search thus lots of comparisons.
.L5:
test r12, r12
je .L1
mov rax, r12
xor ebx, ebx
and eax, 7
je .L7
cmp rax, 1
je .L32
cmp rax, 2
je .L33
cmp rax, 3
je .L34
cmp rax, 4
je .L35
cmp rax, 5
je .L36
cmp rax, 6
jne .L49
Clang makes it even weirder. It first checks if the value results in 10. if this is the case, it will use the fully unrolled loop (no if checks afterwards). But if its smaller, it doesn't do any loop unrolling and just performs the ordinary loop. Clang Godbolt for clang variant 2
cmp r12, 10
jb .LBB0_4
sub r12, r15
lea r14, [rip + .L.str]
xor ebx, ebx
.LBB0_3:
mov rdi, r14
mov rsi, rbx
xor eax, eax
call printf@PLT
lea rsi, [rbx + 1]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 2]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 3]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 4]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 5]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 6]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 7]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 8]
mov rdi, r14
xor eax, eax
call printf@PLT
lea rsi, [rbx + 9]
mov rdi, r14
xor eax, eax
call printf@PLT
add rbx, 10
cmp rbx, r12
jne .LBB0_3
.LBB0_4:
cmp r13, 10
je .LBB0_7
lea r14, [rip + .L.str]
.LBB0_6:
mov rdi, r14
mov rsi, rbx
xor eax, eax
call printf@PLT
inc rbx
dec r15
jne .LBB0_6
Final question, these variants all seem quite ineffective. Especially the linear search for the correct value by GCC or clang not doing any unrolling at all in the second case.
Is it possible to let the compiler do something more clever. For example something like this (so it calcs an initial start_index) (it has the max number of possible iterations) and then jump directly to the final location.
// assumption (exec_iters = 4)
// then start_idx = 4 - 10 = -6
const start_idx = -6;
jump label_fabs({start_idx})
:label_0
printf("%d\n", start_idx + 0); // i = 0
:label_1
printf("%d\n", start_idx + 1); // i = 1
:label_2
printf("%d\n", start_idx + 2); // i = 2
:label_3
printf("%d\n", start_idx + 3); // i = 3
:label_4
printf("%d\n", start_idx + 4); // i = 4
:label_5
printf("%d\n", start_idx + 5); // i = 5
:label_6
printf("%d\n", start_idx + 6); // i = 6
:label_7
printf("%d\n", start_idx + 7); // i = 7
:label_8
printf("%d\n", start_idx + 8); // i = 8
:label_9
printf("%d\n", start_idx + 9); // i = 9
Or even building 10 functions and jump directly to the one with the exact number of iterations.
Even fixing the end of the loop and having a variable start (which should be easier for the compiler to optimize), does not result in something more clever.
#define MIN(a,b) ((a) < (b) ? (a) : (b))
void to_test(size_t num_iters)
{
const size_t exec_iters = MIN(num_iters, 10);
int loop_start = 10 - exec_iters;
int base_offset = exec_iters - 10;
int running_offset = loop_start;
#if __GNUC__
#pragma GCC unroll 10
#elif __clang__
#pragma unroll 10
#endif
for (size_t i = loop_start; i < 10; ++i) {
printf("%d\n", base_offset + running_offset);
++running_offset;
}
}
GCC Version 14.2 Clang Version 19.1.0 Compile args for both: -Ofast -lm