• Loop unrolling is a common program optimisation technique that involves replacing the looping in a program with it’s iterated form

A normal loop

#include <iostream>
#include "utils.hpp"
 
#define ARRAY_SIZE 5
#define LOOP_COUNT 1000000000
 
void normal_loop(int*a, int*b, int*c) {
 
  for(int i=0;i<5;i++) {
    c[i] = a[i] + b[i];
  }
}
 
int main() {
  int a[] = {1, 2, 3, 4, 5};
  int b[] = {1, 2, 3, 4, 5};
  int c[] = {0, 0, 0, 0, 0};
 
  float time_taken;
 
  auto normal_f = [&]() { normal_loop(a, b, c); };
  log_time_n_runs("Normal loop", LOOP_COUNT, normal_f);
}

Corresponding Assembly code (armv8-a clang 15.0.0)

__cxx_global_var_init:                  // @__cxx_global_var_init
        sub     sp, sp, #32
        stp     x29, x30, [sp, #16]             // 16-byte Folded Spill
        add     x29, sp, #16
        adrp    x0, _ZStL8__ioinit
        add     x0, x0, :lo12:_ZStL8__ioinit
        str     x0, [sp, #8]                    // 8-byte Folded Spill
        bl      std::ios_base::Init::Init() [complete object constructor]
        ldr     x1, [sp, #8]                    // 8-byte Folded Reload
        adrp    x0, :got:_ZNSt8ios_base4InitD1Ev
        ldr     x0, [x0, :got_lo12:_ZNSt8ios_base4InitD1Ev]
        adrp    x2, __dso_handle
        add     x2, x2, :lo12:__dso_handle
        bl      __cxa_atexit
        ldp     x29, x30, [sp, #16]             // 16-byte Folded Reload
        add     sp, sp, #32
        ret
normal_loop(int*, int*, int*):                  // @normal_loop(int*, int*, int*)
        sub     sp, sp, #32
        str     x0, [sp, #24]
        str     x1, [sp, #16]
        str     x2, [sp, #8]
        str     wzr, [sp, #4]
        b       .LBB1_1
.LBB1_1:                                // =>This Inner Loop Header: Depth=1
        ldr     w8, [sp, #4]
        subs    w8, w8, #5
        cset    w8, ge
        tbnz    w8, #0, .LBB1_4
        b       .LBB1_2
.LBB1_2:                                //   in Loop: Header=BB1_1 Depth=1
        ldr     x8, [sp, #24]
        ldrsw   x9, [sp, #4]
        ldr     w8, [x8, x9, lsl #2]
        ldr     x9, [sp, #16]
        ldrsw   x10, [sp, #4]
        ldr     w9, [x9, x10, lsl #2]
        add     w8, w8, w9
        ldr     x9, [sp, #8]
        ldrsw   x10, [sp, #4]
        str     w8, [x9, x10, lsl #2]
        b       .LBB1_3
.LBB1_3:                                //   in Loop: Header=BB1_1 Depth=1
        ldr     w8, [sp, #4]
        add     w8, w8, #1
        str     w8, [sp, #4]
        b       .LBB1_1
.LBB1_4:
        add     sp, sp, #32
        ret
main:                                   // @main
        sub     sp, sp, #128
        adrp    x8, .L__const.main.a
        add     x8, x8, :lo12:.L__const.main.a
        ldr     q0, [x8]
        add     x10, sp, #96
        str     q0, [sp, #96]
        ldr     w8, [x8, #16]
        str     w8, [sp, #112]
        adrp    x8, .L__const.main.b
        add     x8, x8, :lo12:.L__const.main.b
        ldr     q0, [x8]
        add     x9, sp, #64
        str     q0, [sp, #64]
        ldr     w8, [x8, #16]
        str     w8, [sp, #80]
        add     x8, sp, #40
        str     xzr, [sp, #40]
        str     xzr, [sp, #48]
        mov     w0, wzr
        str     wzr, [sp, #56]
        str     x10, [sp, #8]
        str     x9, [sp, #16]
        str     x8, [sp, #24]
        add     sp, sp, #128
        ret
_GLOBAL__sub_I_example.cpp:             // @_GLOBAL__sub_I_example.cpp
        stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
        mov     x29, sp
        bl      __cxx_global_var_init
        ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
        ret
.L__const.main.a:
        .word   1                               // 0x1
        .word   2                               // 0x2
        .word   3                               // 0x3
        .word   4                               // 0x4
        .word   5                               // 0x5

.L__const.main.b:
        .word   1                               // 0x1
        .word   2                               // 0x2
        .word   3                               // 0x3
        .word   4                               // 0x4
        .word   5                               // 0x5

An unrolled loop

#include <iostream>
#include "utils.hpp"
 
#define ARRAY_SIZE 5
#define LOOP_COUNT 1000000000
 
void unrolled_loop(int*a, int*b, int*c) {
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
    c[2] = a[2] + b[2];
    c[3] = a[3] + b[3];
    c[4] = a[4] + b[4];
}
 
int main() {
  int a[] = {1, 2, 3, 4, 5};
  int b[] = {1, 2, 3, 4, 5};
  int c[] = {0, 0, 0, 0, 0};
 
  float time_taken;
 
  auto unrolled_f = [&]() { unrolled_loop(a, b, c); };
  log_time_n_runs("Unrolled loop", LOOP_COUNT, unrolled_f);
}

Corresponding assembly code,

__cxx_global_var_init:                  // @__cxx_global_var_init
        sub     sp, sp, #32
        stp     x29, x30, [sp, #16]             // 16-byte Folded Spill
        add     x29, sp, #16
        adrp    x0, _ZStL8__ioinit
        add     x0, x0, :lo12:_ZStL8__ioinit
        str     x0, [sp, #8]                    // 8-byte Folded Spill
        bl      std::ios_base::Init::Init() [complete object constructor]
        ldr     x1, [sp, #8]                    // 8-byte Folded Reload
        adrp    x0, :got:_ZNSt8ios_base4InitD1Ev
        ldr     x0, [x0, :got_lo12:_ZNSt8ios_base4InitD1Ev]
        adrp    x2, __dso_handle
        add     x2, x2, :lo12:__dso_handle
        bl      __cxa_atexit
        ldp     x29, x30, [sp, #16]             // 16-byte Folded Reload
        add     sp, sp, #32
        ret
unrolled_loop(int*, int*, int*):                // @unrolled_loop(int*, int*, int*)
        sub     sp, sp, #32
        str     x0, [sp, #24]
        str     x1, [sp, #16]
        str     x2, [sp, #8]
        ldr     x8, [sp, #24]
        ldr     w8, [x8]
        ldr     x9, [sp, #16]
        ldr     w9, [x9]
        add     w8, w8, w9
        ldr     x9, [sp, #8]
        str     w8, [x9]
        ldr     x8, [sp, #24]
        ldr     w8, [x8, #4]
        ldr     x9, [sp, #16]
        ldr     w9, [x9, #4]
        add     w8, w8, w9
        ldr     x9, [sp, #8]
        str     w8, [x9, #4]
        ldr     x8, [sp, #24]
        ldr     w8, [x8, #8]
        ldr     x9, [sp, #16]
        ldr     w9, [x9, #8]
        add     w8, w8, w9
        ldr     x9, [sp, #8]
        str     w8, [x9, #8]
        ldr     x8, [sp, #24]
        ldr     w8, [x8, #12]
        ldr     x9, [sp, #16]
        ldr     w9, [x9, #12]
        add     w8, w8, w9
        ldr     x9, [sp, #8]
        str     w8, [x9, #12]
        ldr     x8, [sp, #24]
        ldr     w8, [x8, #16]
        ldr     x9, [sp, #16]
        ldr     w9, [x9, #16]
        add     w8, w8, w9
        ldr     x9, [sp, #8]
        str     w8, [x9, #16]
        add     sp, sp, #32
        ret
main:                                   // @main
        sub     sp, sp, #128
        adrp    x8, .L__const.main.a
        add     x8, x8, :lo12:.L__const.main.a
        ldr     q0, [x8]
        add     x10, sp, #96
        str     q0, [sp, #96]
        ldr     w8, [x8, #16]
        str     w8, [sp, #112]
        adrp    x8, .L__const.main.b
        add     x8, x8, :lo12:.L__const.main.b
        ldr     q0, [x8]
        add     x9, sp, #64
        str     q0, [sp, #64]
        ldr     w8, [x8, #16]
        str     w8, [sp, #80]
        add     x8, sp, #40
        str     xzr, [sp, #40]
        str     xzr, [sp, #48]
        mov     w0, wzr
        str     wzr, [sp, #56]
        str     x10, [sp, #8]
        str     x9, [sp, #16]
        str     x8, [sp, #24]
        add     sp, sp, #128
        ret
_GLOBAL__sub_I_example.cpp:             // @_GLOBAL__sub_I_example.cpp
        stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
        mov     x29, sp
        bl      __cxx_global_var_init
        ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
        ret
.L__const.main.a:
        .word   1                               // 0x1
        .word   2                               // 0x2
        .word   3                               // 0x3
        .word   4                               // 0x4
        .word   5                               // 0x5

.L__const.main.b:
        .word   1                               // 0x1
        .word   2                               // 0x2
        .word   3                               // 0x3
        .word   4                               // 0x4
        .word   5                               // 0x5

What changed?


Refs