So I'm writing some program consisting of a simple heap with C. The problem pop up when I'm doing the 'pop' operation part, the whole 'pop' function looks like this——
typedef long long ll;
ll pop(){
ll res=hp[1];hp[1]=hp[used];
for(ll x=1,y=2;y<used;){
// ll w=(((y|1)<used)&&(hp[y|1]>=hp[y]))?(y|1):y;
ll w=y|(((y|1)<used)&&(hp[y|1]>=hp[y]));
if(hp[w]<=hp[x])break;
ll temp=hp[x];hp[x]=hp[w];hp[w]=temp;
x=w,y=w<<1;
}
used--;
return res;
}
Where 'hp' refers to the heap data, and 'used' is a global var showing the element count of the heap. Now the problem is, I tested the performance of line5 and line6, which I think do the same thing: to create a var 'w' with a certain value, but I found that line5 does much faster than line6(e.g. when repeating 'pop' 20million times, there's a difference of 0.1 seconds or more), and I can't figure out exactly why?
I thought it might take some time to transfer from bool to long long(if there ever exists such transfer), so I tried this below:
#include<stdio.h>
#include<time.h>
typedef __int128_t ll;
#define bool _Bool
int main(){
int a=clock();
// ll x=1;
bool x=1;
ll y=15415146514465;
for(ll i=0;i<200000000;i++){
ll z=x|y;
y+=z;
}
int b=clock();
printf("%lf",(b-a)/1000.0);
return 0;
}
and found a slight difference(less than 0.05 seconds when repeating 200million times) if I switch between two types of var 'x'. However, is it that all the performance difference up there is caused by the same problem below(it seems that the type transfer doesn't cost that much)? Is there perhaps another issue I did not notice? Is my assumption even right?
So I'm writing some program consisting of a simple heap with C. The problem pop up when I'm doing the 'pop' operation part, the whole 'pop' function looks like this——
typedef long long ll;
ll pop(){
ll res=hp[1];hp[1]=hp[used];
for(ll x=1,y=2;y<used;){
// ll w=(((y|1)<used)&&(hp[y|1]>=hp[y]))?(y|1):y;
ll w=y|(((y|1)<used)&&(hp[y|1]>=hp[y]));
if(hp[w]<=hp[x])break;
ll temp=hp[x];hp[x]=hp[w];hp[w]=temp;
x=w,y=w<<1;
}
used--;
return res;
}
Where 'hp' refers to the heap data, and 'used' is a global var showing the element count of the heap. Now the problem is, I tested the performance of line5 and line6, which I think do the same thing: to create a var 'w' with a certain value, but I found that line5 does much faster than line6(e.g. when repeating 'pop' 20million times, there's a difference of 0.1 seconds or more), and I can't figure out exactly why?
I thought it might take some time to transfer from bool to long long(if there ever exists such transfer), so I tried this below:
#include<stdio.h>
#include<time.h>
typedef __int128_t ll;
#define bool _Bool
int main(){
int a=clock();
// ll x=1;
bool x=1;
ll y=15415146514465;
for(ll i=0;i<200000000;i++){
ll z=x|y;
y+=z;
}
int b=clock();
printf("%lf",(b-a)/1000.0);
return 0;
}
and found a slight difference(less than 0.05 seconds when repeating 200million times) if I switch between two types of var 'x'. However, is it that all the performance difference up there is caused by the same problem below(it seems that the type transfer doesn't cost that much)? Is there perhaps another issue I did not notice? Is my assumption even right?
Share Improve this question asked Feb 1 at 13:28 suspect_xsuspect_x 11 silver badge2 bronze badges 5 |1 Answer
Reset to default 1Simply, the compiler is not compiling these lines the same way when you optimize. So the performance is slightly different.
typedef long long ll;
ll pop(ll *hp, size_t used, ll y)
{
ll w=(((y|1)<used)&&(hp[y|1]>=hp[y]))?(y|1):y;
return w;
}
ll pop1(ll *hp, size_t used, ll y)
{
ll w=y|(((y|1)<used)&&(hp[y|1]>=hp[y]));
return w;
}
pop:
mov rax, rdx
or rdx, 1
cmp rdx, rsi
jnb .L2
mov rcx, QWORD PTR [rdi+rax*8]
cmp QWORD PTR [rdi+rdx*8], rcx
cmovge rax, rdx
.L2:
ret
pop1:
mov rax, rdx
or rdx, 1
cmp rdx, rsi
jnb .L6
mov rcx, QWORD PTR [rdi+rax*8]
cmp QWORD PTR [rdi+rdx*8], rcx
setge dl
movzx edx, dl
or rax, rdx
.L6:
ret
https://godbolt./z/q7dK1Kqbs
https://godbolt./z/za866szvq
As a side note: your code is a perfect example of how not to name variables and structure code. It is completely unreadable.
clock()
the profiling method you used? Did you check the generated assembly instructions? The second presented code has no side effects, it's equal toprintf("%lf", (clock()-clock())/1000)
– KamilCuk Commented Feb 1 at 13:34clock
function is not specified, other than it should beCLOCKS_PER_SEC
. And that's beside the fact that the behavior ofclock
is platform-dependent (it measures different things in Windows and POSIX environments). – Some programmer dude Commented Feb 1 at 13:44w = y | <turgid_code>;
that the compiler is unable to see through to optimise it. You seem to confuse opaque cryptic unreadable code with higher performance. Nothing could be further from the truth. In the first form bothy
andy|1
are available and a conditional move can be used. Compilers really struggle when you try to micromanage optimisation in a HLL. – Martin Brown Commented Feb 1 at 13:54