Rust should use registers more aggressively #26494

Closed
reinerp opened this issue Jun 22, 2015 · 14 comments · Fixed by #76986
Labels
A-codegen (Area: Code generation) · C-enhancement (Category: An issue proposing an enhancement or a PR with one) · I-slow (Issue: Problems and improvements with respect to performance of generated code) · T-compiler (Relevant to the compiler team, which will review and decide on the PR/issue)

Comments

reinerp commented Jun 22, 2015

Rust should pass more structs in registers. Consider these examples: ideally, both functions would execute entirely in registers and wouldn't touch memory:

// Passing small structs by value.
pub fn parameters_by_value(v: (u64, u64)) -> u64 {
  v.0 + v.1
}

// Returning small structs by value.
pub fn return_by_value() -> (u64, u64) {
  (3, 4)
}

Rust, as of a recent 1.2.0-dev nightly, is unable to pass either of these in registers (see LLVM IR and ASM below). It would be pretty safe to pass and return small structs (ones that fit in at most two registers) in registers, and doing so is likely to improve performance on average. This is what the System V ABI does.
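
For comparison, a hand-exploded scalar sketch (hypothetical, for illustration) already stays entirely in registers under any common ABI; the request is for the (u64, u64) versions above to lower the same way:

// Hypothetical hand-exploded equivalent of parameters_by_value.
pub fn parameters_by_value_scalar(a: u64, b: u64) -> u64 {
  a + b
}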

It would also be nice to exploit Rust's control over aliasing and, where possible, promote reference arguments to registers as well, i.e. put the u64 values in registers for the following functions:

// Passing small structs by reference.
pub fn parameters_by_ref(v: &(u64, u64)) -> u64 {
  v.0 + v.1
}

// Passing small structs by *mutable* reference.
pub fn mutable_parameters_by_ref(v: &mut (u64, u64)) {
  v.0 += 1;
  v.1 += 2;
}

In the &mut case, this would mean passing two u64 values in registers as function parameters and returning two u64 values in registers as the return values (ideally we'd arrange for the parameter registers to match the return registers). The uniqueness of &mut makes this optimization valid, although we may have to give it up in cases where raw pointers are present.
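
Concretely, the lowered form could look something like this hypothetical sketch (illustration only, not proposed syntax):

// Hypothetical register-level lowering of mutable_parameters_by_ref: the
// pointee is exploded into scalar arguments on entry, and the updated
// values come back as return values.
fn mutable_parameters_by_ref_lowered(a: u64, b: u64) -> (u64, u64) {
  (a + 1, b + 2)
}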

Here's a more realistic example where I've wanted Rust to do this:

pub fn skip_whitespace(iter: &mut std::str::Chars) -> u64 {
  // Reads as much whitespace as possible from the front of iter, then returns the number of
  // characters read.
  ...
}

This function is too large to justify inlining. I'd like the begin and end pointers of iter to be kept in registers across the function call.
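
For reference, a minimal sketch of one possible body (hypothetical; the original elides it):

pub fn skip_whitespace(iter: &mut std::str::Chars) -> u64 {
  let mut count = 0;
  // Peek by cloning the iterator; consume only whitespace characters.
  while iter.clone().next().map_or(false, |c| c.is_whitespace()) {
    iter.next();
    count += 1;
  }
  count
}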

Probably not surprising to the compiler team, but for completeness here is the LLVM IR of the above snippets, as of today's Rust (1.2.0-dev), compiled in release mode / opt-level=3:

define i64 @_ZN19parameters_by_value20h3d287104250c57c4eaaE({ i64, i64 }* noalias nocapture dereferenceable(16)) unnamed_addr #0 {
entry-block:
  %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 0
  %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 1
  %3 = load i64, i64* %1, align 8
  %4 = load i64, i64* %2, align 8
  %5 = add i64 %4, %3
  %6 = bitcast { i64, i64 }* %0 to i8*
  tail call void @llvm.lifetime.end(i64 16, i8* %6)
  ret i64 %5
}

define void @_ZN15return_by_value20h703d16a2e5f298d6saaE({ i64, i64 }* noalias nocapture sret dereferenceable(16)) unnamed_addr #0 {
entry-block:
  %1 = bitcast { i64, i64 }* %0 to i8*
  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i64, i64 }* @const1285 to i8*), i64 16, i32 8, i1 false)
  ret void
}

define i64 @_ZN17parameters_by_ref20hc9c548b23d173a1aBaaE({ i64, i64 }* noalias nocapture readonly dereferenceable(16)) unnamed_addr #2 {
entry-block:
  %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 0
  %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 1
  %3 = load i64, i64* %1, align 8
  %4 = load i64, i64* %2, align 8
  %5 = add i64 %4, %3
  ret i64 %5
}

define void @_ZN25mutable_parameters_by_ref20h736bc2daba227c43QaaE({ i64, i64 }* noalias nocapture dereferenceable(16)) unnamed_addr #0 {
entry-block:
  %1 = bitcast { i64, i64 }* %0 to <2 x i64>*
  %2 = load <2 x i64>, <2 x i64>* %1, align 8
  %3 = add <2 x i64> %2, <i64 1, i64 2>
  %4 = bitcast { i64, i64 }* %0 to <2 x i64>*
  store <2 x i64> %3, <2 x i64>* %4, align 8
  ret void
}

and here is the ASM:

_ZN19parameters_by_value20h3d287104250c57c4eaaE:
        .cfi_startproc
        movq    8(%rdi), %rax
        addq    (%rdi), %rax
        retq

_ZN15return_by_value20h703d16a2e5f298d6saaE:
        .cfi_startproc
        movups  const1285(%rip), %xmm0
        movups  %xmm0, (%rdi)
        movq    %rdi, %rax
        retq

_ZN17parameters_by_ref20hc9c548b23d173a1aBaaE:
        .cfi_startproc
        movq    8(%rdi), %rax
        addq    (%rdi), %rax
        retq

_ZN25mutable_parameters_by_ref20h736bc2daba227c43QaaE:
        .cfi_startproc
        movdqu  (%rdi), %xmm0
        paddq   .LCPI3_0(%rip), %xmm0
        movdqu  %xmm0, (%rdi)
        retq
dotdash (Contributor) commented Jun 22, 2015

For fat pointers, this has been fixed in #26411.

@sfackler sfackler added the A-codegen Area: Code generation label Jun 22, 2015
arielb1 (Contributor) commented Jun 22, 2015

For skip_whitespace etc., this is currently impossible because the address of a reference is significant (you can, e.g., print it with println!("{:?}", iter as *const _)).
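
A minimal illustration of that obstacle (hypothetical example, not from the thread): the callee can turn the reference into an observable address, so the pointee must genuinely exist in memory:

// The address of v's pointee is observable, so the (u64, u64) cannot
// simply be replaced by two register values.
pub fn observes_address(v: &(u64, u64)) -> usize {
  v as *const (u64, u64) as usize
}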

@dotdash dotdash self-assigned this Jun 22, 2015
reinerp (Author) commented Jun 22, 2015

@arielb1 - yes, the ability to take addresses prevents doing this for all functions. But you could treat it as an optimization: have the compiler check whether a function uses the address of a reference and, if not, change its calling convention. The compiler can use the worker/wrapper technique (as in GHC) to support call sites that aren't aware of the changed calling convention, by splitting the function in two:

// Non-inlined worker, uses the fast in-register calling convention.
pub fn skip_whitespace_worker(start: *const u8, end: *const u8) -> (*const u8, *const u8, u64) {
  ...
}

// Inlined wrapper, moves the reference argument into registers and then calls the worker.
pub fn skip_whitespace(v: &mut std::str::Chars) -> u64 {
  // Approximately:
  let (start, end, len) = skip_whitespace_worker(v.start, v.end);
  v.start = start;
  v.end = end;
  len
}
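
A compilable version of the same split might look like the following sketch, with a hypothetical Cursor standing in for Chars (whose start and end fields are private):

pub struct Cursor {
  start: *const u8,
  end: *const u8,
}

// Worker: all state travels in scalar arguments and return values.
#[inline(never)]
fn skip_spaces_worker(mut start: *const u8, end: *const u8) -> (*const u8, u64) {
  let mut len = 0;
  unsafe {
    while start < end && *start == b' ' {
      start = start.add(1);
      len += 1;
    }
  }
  (start, len)
}

// Wrapper: loads the pointee into registers, calls the worker, writes back.
#[inline(always)]
pub fn skip_spaces(v: &mut Cursor) -> u64 {
  let (start, len) = skip_spaces_worker(v.start, v.end);
  v.start = start;
  len
}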

pcwalton (Contributor) commented

Don't do this. You will break FastISel. Fix LLVM if it's not doing the optimizations you want.

dotdash (Contributor) commented Jul 20, 2015

@pcwalton what exactly would break FastISel?

@dotdash dotdash removed their assignment Mar 4, 2016
@Mark-Simulacrum Mark-Simulacrum added C-enhancement Category: An issue proposing an enhancement or a PR with one. I-slow Issue: Problems and improvements with respect to performance of generated code. labels Jul 22, 2017
nox (Contributor) commented Apr 2, 2018

Pretty sure that since @eddyb's work on niche-filling optimisation, parameters_by_value and return_by_value use a pair of registers for the argument and the return value, respectively.

Cc @rust-lang/wg-codegen

gnzlbg (Contributor) commented Sep 14, 2019

@eddyb didn't Abi::ScalarPair solve this?

gnzlbg (Contributor) commented Sep 14, 2019

Yes, this appears to be fixed. The following Rust code (https://rust.godbolt.org/z/LhQGOM):

// Passing small structs by value.
pub fn parameters_by_value(v: (u64, u64)) -> u64 {
  v.0 + v.1
}

// Returning small structs by value.
pub fn return_by_value() -> (u64, u64) {
  (3, 4)
}

generates

example::parameters_by_value:
        lea     rax, [rdi + rsi]
        ret

example::return_by_value:
        mov     eax, 3
        mov     edx, 4
        ret

and

define i64 @parameters_by_value(i64 %v.0, i64 %v.1) unnamed_addr #0 {
start:
  %0 = add i64 %v.1, %v.0
  ret i64 %0
}

define { i64, i64 } @return_by_value() unnamed_addr #0 {
start:
  ret { i64, i64 } { i64 3, i64 4 }
}

gnzlbg (Contributor) commented Sep 14, 2019

Ah no, I see that this issue also suggests:

It would also be nice to exploit Rust's control over aliasing, and where possible also promote reference arguments to registers, i.e. put the u64 values in registers for the following functions:

We don't do that yet: https://rust.godbolt.org/z/qnK9Ta .

FWIW, maybe it is worth splitting this issue into two. One part has been closed already, AFAICT.

The other part (passing &T as a register containing a T) is a suggestion at most. This issue does not spell out how to do it. @pcwalton already raised the issue that this breaks if the address is inspected. It probably also needs to handle writes through &T (e.g. due to UnsafeCell) and &mut T somehow, etc. So I'm skeptical that this can work.
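
For example (hypothetical illustration), writes through a shared reference are possible with UnsafeCell, so "&T is read-only" alone cannot justify caching the pointee in a register:

use std::cell::UnsafeCell;

// *v may change between the two reads (e.g. f mutates it through another
// alias), so the compiler cannot keep v's pointee in a register across f.
pub fn reads_twice(v: &UnsafeCell<u64>, f: impl Fn()) -> (u64, u64) {
  let a = unsafe { *v.get() };
  f();
  let b = unsafe { *v.get() };
  (a, b)
}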

@jonas-schievink jonas-schievink added the T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. label Sep 14, 2019
matklad (Member) commented Apr 26, 2020

Here's another, slightly more complex example, where the extern "C" ABI seems to be more efficient than extern "Rust":

pub struct Stats { x: u32, y: u32, z: u32, }

pub extern "C" fn sum_c(a: &Stats, b: &Stats) -> Stats {
    return Stats {x: a.x + b.x, y: a.y + b.y, z: a.z + b.z };
}

pub fn sum_rust(a: &Stats, b: &Stats) -> Stats {
    return Stats {x: a.x + b.x, y: a.y + b.y, z: a.z + b.z };
}

godbolt

@bors bors closed this as completed in 62fe055 Sep 27, 2020
bors added a commit to rust-lang-ci/rust that referenced this issue Dec 2, 2020
Pass arguments up to 2*usize by value

In rust-lang#77434 (comment), `@eddyb` said:

> I wonder if it makes sense to limit this to returns [...]

Let's do a perf run and find out.

It seems the `extern "C"` ABI will pass arguments up to 2*usize in registers: https://godbolt.org/z/n8E6zc. (modified from rust-lang#26494 (comment))

r? `@nagisa`
oilaba commented Jan 1, 2021

@matklad is this fixed? The linked code is still bigger on Rust 1.48.

matklad (Member) commented Jan 3, 2021

@oilaba my example is fixed; the return value no longer goes via the stack:

before:
        mov     dword ptr [rdi], ecx
        mov     dword ptr [rdi + 4], r8d
        mov     dword ptr [rdi + 8], edx
        ret

after:
        movd    edx, xmm0
        movd    ecx, xmm1
        shl     rcx, 32
        or      rax, rcx
        ret

The difference is that the Rust ABI version now uses a vectorized add, which seems like a win?

shampoofactory (Contributor) commented Dec 7, 2021

Hi all. I've traced the cause of P-high issue #85265 back to two pulls related to this issue, namely #76986 and #79547.

Basically, these commits break auto-vectorization after Rust 1.48 in a way that is not easily worked around.

In short, if we take:

pub fn case_1(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [
        a[0] + b[0],
        a[1] + b[1],
        a[2] + b[2],
        a[3] + b[3],
    ]
}

1.47 yields efficient code:

https://rust.godbolt.org/z/v89zoajse

example::case_1:
        mov     rax, rdi
        movups  xmm0, xmmword ptr [rsi]
        movups  xmm1, xmmword ptr [rdx]
        addps   xmm1, xmm0
        movups  xmmword ptr [rdi], xmm1
        ret

1.48 yields awkward code:

https://rust.godbolt.org/z/hqT85doqf

example::case_1:
        movss   xmm0, dword ptr [rdi]
        movss   xmm1, dword ptr [rdi + 4]
        addss   xmm0, dword ptr [rsi]
        addss   xmm1, dword ptr [rsi + 4]
        movsd   xmm2, qword ptr [rdi + 8]
        movsd   xmm3, qword ptr [rsi + 8]
        addps   xmm3, xmm2
        movd    eax, xmm0
        movd    ecx, xmm1
        movd    esi, xmm3
        shufps  xmm3, xmm3, 229
        movd    edx, xmm3
        shl     rdx, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

1.50 yields an even more awkward cascade effect:

https://rust.godbolt.org/z/vvEePzGEM

example::case_1:
        movd    xmm0, esi
        shr     rsi, 32
        movd    xmm1, edi
        shr     rdi, 32
        movd    xmm2, edi
        punpckldq       xmm1, xmm2
        movd    xmm2, esi
        movd    xmm3, edx
        shr     rdx, 32
        movd    xmm4, edx
        punpckldq       xmm3, xmm4
        movd    xmm4, ecx
        shr     rcx, 32
        addps   xmm3, xmm1
        addss   xmm4, xmm0
        movd    xmm0, ecx
        addss   xmm0, xmm2
        movd    edx, xmm4
        movd    eax, xmm0
        shl     rax, 32
        or      rdx, rax
        movd    ecx, xmm3
        shufps  xmm3, xmm3, 229
        movd    eax, xmm3
        shl     rax, 32
        or      rax, rcx
        ret

There are other examples in #85265.

My overly simplistic analysis (I'm no expert) is here.

In summary, if I rebuild Rust with the above pulls neutered, auto-vectorization appears to return to normal, as per 1.47.
