I am trying to accelerate the computation of the dot product of 2 float arrays on an ARM Cortex A7. I benchmark the pure C code, the code using intrinsics and the assembly code. But my assembly code does not work and throws a segmentation fault.
One important requirement is that the assembly code must not modify the input parameters, as they may be used by other parts of the code later on.
Here is the code:
// Test dot product of 2 arrays in assembly for Neon
#include <stdio.h>
#include <stdlib.h>
float dotProductAsm(const float *a, const float *b, int n) {
float dp = 0.0f;
asm volatile(
"vpush {q4-q7} \n" // Sauvegarde des registres
// Copie des adresses dans des registres temporaires
"mov r4, %0 \n" // r4 = adresse de a
"mov r5, %1 \n" // r5 = adresse de b
"mov r6, %2 \n" // r6 = size
"mov r10, %3 \n" // r10 = dp
"vmov.f32 q4, #0.0 \n" // Accumulateur = 0
// Calcul du nombre de blocs de 4
"lsr r9, r6, #2 \n" // r9 = size/4
"and r8, r6, #3 \n" // r8 = size%4 (reste)
// Si pas de blocs complets, aller aux restants
"cmp r9, #0 \n"
"beq 2f \n"
// Boucle principale (blocs de 4)
"1: \n"
"vld1.32 {q5}, [r4]! \n"
"vld1.32 {q6}, [r5]! \n"
"vmla.f32 q4, q5, q6 \n"
"subs r9, r9, #1 \n"
"bgt 1b \n"
// Traitement des éléments restants
"2: \n"
"cmp r8, #0 \n"
"beq 4f \n"
"3: \n"
// "vld1.32 {s20}, [r4]! \n"
// "vld1.32 {s21}, [r5]! \n"
"vldm.32 r4!, {s20} \n"
"vldm.32 r5!, {s21} \n"
"vmul.f32 s20, s20, s21 \n"
"vadd.f32 s16, s16, s20 \n"
"subs r8, r8, #1 \n"
"bgt 3b \n"
"4: \n" // Réduction finale
"vadd.f32 d8, d8, d9 \n"
"vpadd.f32 d8, d8, d8 \n"
// Stockage du résultat
"vstr.32 s16, [r10] \n" //
// "vst1.32 {d8[0]}, [%3] \n"
"vpop {q4-q7} \n" // Restauration des registres
:
: "r"(a), "r"(b), "r"(n), "r"(dp)
: "r4", "r5", "r6", "r9", "r8", "r10","memory", "cc"
);
printf("\tdotprod = %f\n", dp);
return dp;
}
int main() {
float a[150];
float b[150];
for (int i=0; i<150; ++i) {
a[i] = (float)i;
b[i] = 1.0;
}
printf("ok1\n");
float dp = dotProductAsm(a, b, 15);
return 0;
}
The code is compiled using:
arm-linux-gnueabihf-g++ -mcpu=cortex-a7 -mfloat-abi=hard -mfpu=neon -O3 -ftree-vectorize TestDPasm2.c -o TestDPasm2
If I remove just the line:
"vstr.32 s16, [r10] \n"
the code runs well but I get a zero result. So it must be this line which is wrong. How should I change this line ?
I think that the vpush
and vpop
lines are not required as well. Is it correct?
I am trying to accelerate the computation of the dot product of 2 float arrays on an ARM Cortex A7. I benchmark the pure C code, the code using intrinsics and the assembly code. But my assembly code does not work and throws a segmentation fault.
One important requirement is that the assembly code must not modify the input parameters, as they may be used by other parts of the code later on.
Here is the code:
// Test dot product of 2 arrays in assembly for Neon
#include <stdio.h>
#include <stdlib.h>
float dotProductAsm(const float *a, const float *b, int n) {
float dp = 0.0f;
asm volatile(
"vpush {q4-q7} \n" // Sauvegarde des registres
// Copie des adresses dans des registres temporaires
"mov r4, %0 \n" // r4 = adresse de a
"mov r5, %1 \n" // r5 = adresse de b
"mov r6, %2 \n" // r6 = size
"mov r10, %3 \n" // r10 = dp
"vmov.f32 q4, #0.0 \n" // Accumulateur = 0
// Calcul du nombre de blocs de 4
"lsr r9, r6, #2 \n" // r9 = size/4
"and r8, r6, #3 \n" // r8 = size%4 (reste)
// Si pas de blocs complets, aller aux restants
"cmp r9, #0 \n"
"beq 2f \n"
// Boucle principale (blocs de 4)
"1: \n"
"vld1.32 {q5}, [r4]! \n"
"vld1.32 {q6}, [r5]! \n"
"vmla.f32 q4, q5, q6 \n"
"subs r9, r9, #1 \n"
"bgt 1b \n"
// Traitement des éléments restants
"2: \n"
"cmp r8, #0 \n"
"beq 4f \n"
"3: \n"
// "vld1.32 {s20}, [r4]! \n"
// "vld1.32 {s21}, [r5]! \n"
"vldm.32 r4!, {s20} \n"
"vldm.32 r5!, {s21} \n"
"vmul.f32 s20, s20, s21 \n"
"vadd.f32 s16, s16, s20 \n"
"subs r8, r8, #1 \n"
"bgt 3b \n"
"4: \n" // Réduction finale
"vadd.f32 d8, d8, d9 \n"
"vpadd.f32 d8, d8, d8 \n"
// Stockage du résultat
"vstr.32 s16, [r10] \n" //
// "vst1.32 {d8[0]}, [%3] \n"
"vpop {q4-q7} \n" // Restauration des registres
:
: "r"(a), "r"(b), "r"(n), "r"(dp)
: "r4", "r5", "r6", "r9", "r8", "r10","memory", "cc"
);
printf("\tdotprod = %f\n", dp);
return dp;
}
int main() {
float a[150];
float b[150];
for (int i=0; i<150; ++i) {
a[i] = (float)i;
b[i] = 1.0;
}
printf("ok1\n");
float dp = dotProductAsm(a, b, 15);
return 0;
}
The code is compiled using:
arm-linux-gnueabihf-g++ -mcpu=cortex-a7 -mfloat-abi=hard -mfpu=neon -O3 -ftree-vectorize TestDPasm2.c -o TestDPasm2
If I remove just the line:
"vstr.32 s16, [r10] \n"
the code runs well but I get a zero result. So it must be this line which is wrong. How should I change this line ?
I think that the vpush
and vpop
lines are not required as well. Is it correct?
1 Answer
Reset to default 3You're passing for %3
the value of dp
when your code assumes that the address is passed. Use "r"(&dp)
to pass the address. The other operands are pointers, so they do what you expect.
That said, there is much to improve with this code. I strongly advise against writing longer blocks of code with inline assembly. Split your inline assembly into statements as small as possible, ideally one asm statement per instruction. Express loops and conditionals with C loops and conditionals and everything that can be done in C, do in C. Instead of using fixed registers and moving from input operands into these registers, try to express all data operations in terms of input and output operands. Let the compiler worry about picking registers; the resulting code will be much better.
Also consider using intrinsic functions instead of inline assembly.
If you do want to write a whole block of code in assembly, strongly consider writing that code in assembly in an assembly source file and link it into the program.
vpush
? Why not just declare clobbers on those vector regs instead of changing the stack pointer? That should be safe, though; the ARM ABI doesn't have a red-zone and you aren't using any"m"
memory operands that could have SP-relative addressing modes which would be broken by changing the stack pointer. – Peter Cordes Commented Mar 20 at 11:50float dp
in a GPR, not its address. But your asm uses that bit-pattern as an address. Use an"=t"
output constraint to tell the compiler you're leaving the float value in an FP register so it doesn't have to reload it from memory. gcc.gnu./onlinedocs/gcc/Machine-Constraints.html (Or better, use intrinsics if you don't know how to write inline asm constraints to avoid all thosemov
instructions at the start to copy every input. stackoverflow/tags/inline-assembly/info .) – Peter Cordes Commented Mar 20 at 11:54