- Loop unrolling is a common program optimisation technique that involves replacing the looping in a program with it’s iterated form
A normal loop
#include <iostream>
#include "utils.hpp"
#define ARRAY_SIZE 5
#define LOOP_COUNT 1000000000
void normal_loop(int*a, int*b, int*c) {
for(int i=0;i<5;i++) {
c[i] = a[i] + b[i];
}
}
int main() {
int a[] = {1, 2, 3, 4, 5};
int b[] = {1, 2, 3, 4, 5};
int c[] = {0, 0, 0, 0, 0};
float time_taken;
auto normal_f = [&]() { normal_loop(a, b, c); };
log_time_n_runs("Normal loop", LOOP_COUNT, normal_f);
}
Corresponding Assembly code (armv8-a clang 15.0.0)
__cxx_global_var_init: // @__cxx_global_var_init
sub sp, sp, #32
stp x29, x30, [sp, #16] // 16-byte Folded Spill
add x29, sp, #16
adrp x0, _ZStL8__ioinit
add x0, x0, :lo12:_ZStL8__ioinit
str x0, [sp, #8] // 8-byte Folded Spill
bl std::ios_base::Init::Init() [complete object constructor]
ldr x1, [sp, #8] // 8-byte Folded Reload
adrp x0, :got:_ZNSt8ios_base4InitD1Ev
ldr x0, [x0, :got_lo12:_ZNSt8ios_base4InitD1Ev]
adrp x2, __dso_handle
add x2, x2, :lo12:__dso_handle
bl __cxa_atexit
ldp x29, x30, [sp, #16] // 16-byte Folded Reload
add sp, sp, #32
ret
normal_loop(int*, int*, int*): // @normal_loop(int*, int*, int*)
sub sp, sp, #32
str x0, [sp, #24]
str x1, [sp, #16]
str x2, [sp, #8]
str wzr, [sp, #4]
b .LBB1_1
.LBB1_1: // =>This Inner Loop Header: Depth=1
ldr w8, [sp, #4]
subs w8, w8, #5
cset w8, ge
tbnz w8, #0, .LBB1_4
b .LBB1_2
.LBB1_2: // in Loop: Header=BB1_1 Depth=1
ldr x8, [sp, #24]
ldrsw x9, [sp, #4]
ldr w8, [x8, x9, lsl #2]
ldr x9, [sp, #16]
ldrsw x10, [sp, #4]
ldr w9, [x9, x10, lsl #2]
add w8, w8, w9
ldr x9, [sp, #8]
ldrsw x10, [sp, #4]
str w8, [x9, x10, lsl #2]
b .LBB1_3
.LBB1_3: // in Loop: Header=BB1_1 Depth=1
ldr w8, [sp, #4]
add w8, w8, #1
str w8, [sp, #4]
b .LBB1_1
.LBB1_4:
add sp, sp, #32
ret
main: // @main
sub sp, sp, #128
adrp x8, .L__const.main.a
add x8, x8, :lo12:.L__const.main.a
ldr q0, [x8]
add x10, sp, #96
str q0, [sp, #96]
ldr w8, [x8, #16]
str w8, [sp, #112]
adrp x8, .L__const.main.b
add x8, x8, :lo12:.L__const.main.b
ldr q0, [x8]
add x9, sp, #64
str q0, [sp, #64]
ldr w8, [x8, #16]
str w8, [sp, #80]
add x8, sp, #40
str xzr, [sp, #40]
str xzr, [sp, #48]
mov w0, wzr
str wzr, [sp, #56]
str x10, [sp, #8]
str x9, [sp, #16]
str x8, [sp, #24]
add sp, sp, #128
ret
_GLOBAL__sub_I_example.cpp: // @_GLOBAL__sub_I_example.cpp
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
bl __cxx_global_var_init
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
.L__const.main.a:
.word 1 // 0x1
.word 2 // 0x2
.word 3 // 0x3
.word 4 // 0x4
.word 5 // 0x5
.L__const.main.b:
.word 1 // 0x1
.word 2 // 0x2
.word 3 // 0x3
.word 4 // 0x4
.word 5 // 0x5
An unrolled loop
#include <iostream>
#include "utils.hpp"
#define ARRAY_SIZE 5
#define LOOP_COUNT 1000000000
void unrolled_loop(int*a, int*b, int*c) {
c[0] = a[0] + b[0];
c[1] = a[1] + b[1];
c[2] = a[2] + b[2];
c[3] = a[3] + b[3];
c[4] = a[4] + b[4];
}
int main() {
int a[] = {1, 2, 3, 4, 5};
int b[] = {1, 2, 3, 4, 5};
int c[] = {0, 0, 0, 0, 0};
float time_taken;
auto unrolled_f = [&]() { unrolled_loop(a, b, c); };
log_time_n_runs("Unrolled loop", LOOP_COUNT, unrolled_f);
}
Corresponding assembly code,
__cxx_global_var_init: // @__cxx_global_var_init
sub sp, sp, #32
stp x29, x30, [sp, #16] // 16-byte Folded Spill
add x29, sp, #16
adrp x0, _ZStL8__ioinit
add x0, x0, :lo12:_ZStL8__ioinit
str x0, [sp, #8] // 8-byte Folded Spill
bl std::ios_base::Init::Init() [complete object constructor]
ldr x1, [sp, #8] // 8-byte Folded Reload
adrp x0, :got:_ZNSt8ios_base4InitD1Ev
ldr x0, [x0, :got_lo12:_ZNSt8ios_base4InitD1Ev]
adrp x2, __dso_handle
add x2, x2, :lo12:__dso_handle
bl __cxa_atexit
ldp x29, x30, [sp, #16] // 16-byte Folded Reload
add sp, sp, #32
ret
unrolled_loop(int*, int*, int*): // @unrolled_loop(int*, int*, int*)
sub sp, sp, #32
str x0, [sp, #24]
str x1, [sp, #16]
str x2, [sp, #8]
ldr x8, [sp, #24]
ldr w8, [x8]
ldr x9, [sp, #16]
ldr w9, [x9]
add w8, w8, w9
ldr x9, [sp, #8]
str w8, [x9]
ldr x8, [sp, #24]
ldr w8, [x8, #4]
ldr x9, [sp, #16]
ldr w9, [x9, #4]
add w8, w8, w9
ldr x9, [sp, #8]
str w8, [x9, #4]
ldr x8, [sp, #24]
ldr w8, [x8, #8]
ldr x9, [sp, #16]
ldr w9, [x9, #8]
add w8, w8, w9
ldr x9, [sp, #8]
str w8, [x9, #8]
ldr x8, [sp, #24]
ldr w8, [x8, #12]
ldr x9, [sp, #16]
ldr w9, [x9, #12]
add w8, w8, w9
ldr x9, [sp, #8]
str w8, [x9, #12]
ldr x8, [sp, #24]
ldr w8, [x8, #16]
ldr x9, [sp, #16]
ldr w9, [x9, #16]
add w8, w8, w9
ldr x9, [sp, #8]
str w8, [x9, #16]
add sp, sp, #32
ret
main: // @main
sub sp, sp, #128
adrp x8, .L__const.main.a
add x8, x8, :lo12:.L__const.main.a
ldr q0, [x8]
add x10, sp, #96
str q0, [sp, #96]
ldr w8, [x8, #16]
str w8, [sp, #112]
adrp x8, .L__const.main.b
add x8, x8, :lo12:.L__const.main.b
ldr q0, [x8]
add x9, sp, #64
str q0, [sp, #64]
ldr w8, [x8, #16]
str w8, [sp, #80]
add x8, sp, #40
str xzr, [sp, #40]
str xzr, [sp, #48]
mov w0, wzr
str wzr, [sp, #56]
str x10, [sp, #8]
str x9, [sp, #16]
str x8, [sp, #24]
add sp, sp, #128
ret
_GLOBAL__sub_I_example.cpp: // @_GLOBAL__sub_I_example.cpp
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
bl __cxx_global_var_init
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
.L__const.main.a:
.word 1 // 0x1
.word 2 // 0x2
.word 3 // 0x3
.word 4 // 0x4
.word 5 // 0x5
.L__const.main.b:
.word 1 // 0x1
.word 2 // 0x2
.word 3 // 0x3
.word 4 // 0x4
.word 5 // 0x5
What changed?
Refs
- https://godbolt.org/ - site to look at the generated assembly code