How can a nop instruction speed up my test program?
Posted: Sun Oct 30, 2022 11:48 am
I wrote a test program to measure cache performance on x86 and ran into an unexpected result.
The assembly generated for the loop is as follows:
The program will print:
But if I insert a nop instruction:
My program is compiled with -O0.
I can't understand how the nop instruction speeds up my program. Thanks!
Code: Select all
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
/* Size in bytes of each heap block requested from malloc(). */
#define BUF_SIZE 8192
/* Number of stores executed inside the timed loop. */
#define ROUND 100000000UL

/*
 * Parse `text` as a signed decimal byte offset.
 *
 * On success the value is stored in *out (converted to unsigned long, so a
 * negative offset is kept in wrapped form) and 0 is returned.
 * Returns -1 if `text` is empty or has trailing non-numeric characters.
 */
static int parse_offset(const char *text, unsigned long *out)
{
    char *end;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0')
        return -1;
    *out = (unsigned long)value;
    return 0;
}

/*
 * Micro-benchmark: time ROUND stores of an unsigned long to the address
 * `buf_pageend + offset`, where buf_pageend is a 4096-byte-aligned address
 * inside a malloc()ed buffer and `offset` (argv[1]) is a signed byte offset.
 * Prints the addresses of `i` and `offset`, then the average time per
 * iteration in nanoseconds.
 *
 * The timed loop and the declarations of `i` and `offset` are deliberately
 * left exactly as originally written, because the code generated for them
 * is what is being measured.
 */
int main(int argc, char **argv)
{
    /*
     * buf_newaddr and buf_realsize are unused. They are retained so that the
     * set of locals in this function is the same as in the original program.
     */
    char *buf, *buf_newaddr, *buf_pageend;
    unsigned long i __attribute__((aligned(64)));
    int buf_realsize;
    unsigned long offset __attribute__((aligned(64)));
    struct timespec start={0,0}, end={0,0};
    double start_ns, end_ns;

    if (argc != 2) {
        printf("missing args\n");
        exit(-1);
    }
    if (parse_offset(argv[1], &offset) != 0) {
        fprintf(stderr, "invalid offset: %s\n", argv[1]);
        exit(-1);
    }

    /*
     * Allocate until the first 4096-byte boundary at or after the start of
     * the block lies at least 1024 bytes into it, so that there is room for
     * a negative offset. Rejected blocks are intentionally not freed:
     * keeping them occupied makes the next malloc() return a different block.
     */
    for (;;) {
        buf = malloc(BUF_SIZE);
        if (buf == NULL) {
            /* Without this check a NULL result would retry forever. */
            perror("malloc");
            exit(-1);
        }
        buf_pageend = (char *)(((unsigned long)buf + 4095) & ~4095UL);
        if (buf_pageend - buf >= 1024)
            break;
    }

    /* Reject offsets whose 8-byte store would fall outside the buffer. */
    if ((long)offset < buf - buf_pageend ||
        (long)offset > (buf + BUF_SIZE - sizeof(unsigned long)) - buf_pageend) {
        fprintf(stderr, "offset out of range: %ld\n", (long)offset);
        exit(-1);
    }

    memset(buf, 0, BUF_SIZE);
    /* %p with void * is the matching conversion for an object address. */
    printf("&i = %p, &offset=%p\n", (void *)&i, (void *)&offset);

    if (clock_gettime(CLOCK_MONOTONIC, &start) != 0) {
        perror("clock_gettime");
        exit(-1);
    }
    for (i = 0; i < ROUND; i++) {
        *((unsigned long *)(buf_pageend + offset)) = 0; // mark
    }
    if (clock_gettime(CLOCK_MONOTONIC, &end) != 0) {
        perror("clock_gettime");
        exit(-1);
    }

    /* Convert in floating point so the seconds term cannot overflow an integer type. */
    start_ns = (double)start.tv_sec * 1e9 + (double)start.tv_nsec;
    end_ns = (double)end.tv_sec * 1e9 + (double)end.tv_nsec;
    printf("ns: %lf\n", (end_ns - start_ns)/ROUND);

    free(buf);
    return 0;
}
Code: Select all
# gcc -O0 output for the timed loop (x86-64, AT&T syntax), bracketed by the two
# clock_gettime calls. Every C variable is reloaded from and stored back to its
# stack slot on each iteration.
call clock_gettime              # first timing call, before the loop
movq $0, -112(%rbp)             # i = 0; the loop counter lives at -112(%rbp)
jmp .L5                         # enter at the loop condition
.L6:                            # loop body
movq -176(%rbp), %rdx           # load offset (64 bytes below i, matching the printed &i / &offset)
movq -64(%rbp), %rax            # load the other addend of the store address, i.e. buf_pageend
addq %rdx, %rax                 # rax = buf_pageend + offset
movq $0, (%rax)                 # the store marked "// mark" in the C source
movq -112(%rbp), %rax           # reload i
addq $1, %rax                   # i + 1
movq %rax, -112(%rbp)           # write i back to the stack
.L5:                            # loop condition
movq -112(%rbp), %rax           # reload i again
cmpq $99999999, %rax            # compare with ROUND - 1
jbe .L6                         # unsigned i <= ROUND - 1: run another iteration
leaq -208(%rbp), %rax           # address of the local at -208(%rbp) (&end in the C source)
movq %rax, %rsi                 # second argument
movl $1, %edi                   # first argument: clock id 1 (CLOCK_MONOTONIC in the C source)
call clock_gettime              # second timing call, after the loop
Code: Select all
$ ./a.out 0
&i = 7fffd9811700, &offset=7fffd98116c0
ns: 3.217088
The result will be:
.L6:
# Loop body of the modified build: the same instructions as the first listing,
# with a single nop inserted before the store.
movq -176(%rbp), %rdx           # load offset
movq -64(%rbp), %rax            # load the other addend of the store address, i.e. buf_pageend
addq %rdx, %rax                 # rax = buf_pageend + offset
nop                             # the one added instruction; it changes no register or memory
movq $0, (%rax)                 # the store marked "// mark" in the C source
movq -112(%rbp), %rax           # reload i
addq $1, %rax                   # i + 1
movq %rax, -112(%rbp)           # write i back to the stack
.L5:                            # loop condition follows, as in the first listing
Code: Select all
$ ./a.out 0
&i = 7ffef00bb380, &offset=7ffef00bb340
ns: 2.104905
I can't understand how the nop instruction speeds up my program. Thanks!