Skip to content

Commit

Permalink
Rollup merge of rust-lang#52051 - scottmcm:swap-directly, r=alexcrichton
Browse files Browse the repository at this point in the history
mem::swap the obvious way for types smaller than the SIMD optimization's block size

LLVM isn't able to remove the alloca for the unaligned block in the post-SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't.  Found in the `replace_with` RFC discussion.

Examples of the improvements:
<details>
 <summary>swapping `[u16; 3]` takes 1/3 fewer instructions and no stackalloc</summary>

```rust
type Demo = [u16; 3];
pub fn swap_demo(x: &mut Demo, y: &mut Demo) {
    std::mem::swap(x, y);
}
```

nightly:
```asm
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
.seh_proc _ZN4blah9swap_demo17ha1732a9b71393a7eE
	sub	rsp, 32
	.seh_stackalloc 32
	.seh_endprologue
	movzx	eax, word ptr [rcx + 4]
	mov	word ptr [rsp + 4], ax
	mov	eax, dword ptr [rcx]
	mov	dword ptr [rsp], eax
	movzx	eax, word ptr [rdx + 4]
	mov	word ptr [rcx + 4], ax
	mov	eax, dword ptr [rdx]
	mov	dword ptr [rcx], eax
	movzx	eax, word ptr [rsp + 4]
	mov	word ptr [rdx + 4], ax
	mov	eax, dword ptr [rsp]
	mov	dword ptr [rdx], eax
	add	rsp, 32
	ret
	.seh_handlerdata
	.section	.text,"xr",one_only,_ZN4blah9swap_demo17ha1732a9b71393a7eE
	.seh_endproc
```

this PR:
```asm
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
	mov	r8d, dword ptr [rcx]
	movzx	r9d, word ptr [rcx + 4]
	movzx	eax, word ptr [rdx + 4]
	mov	word ptr [rcx + 4], ax
	mov	eax, dword ptr [rdx]
	mov	dword ptr [rcx], eax
	mov	word ptr [rdx + 4], r9w
	mov	dword ptr [rdx], r8d
	ret
```
</details>

<details>
 <summary>`replace_with` optimizes down much better</summary>

Inspired by rust-lang/rfcs#2490,

```rust
fn replace_with<T, F>(x: &mut Option<T>, f: F)
    where F: FnOnce(Option<T>) -> Option<T>
{
    *x = f(x.take());
}

pub fn inc_opt(mut x: &mut Option<i32>) {
    replace_with(&mut x, |i| i.map(|j| j + 1));
}
```

Rust 1.26.0:
```asm
_ZN4blah7inc_opt17heb0acb64c51777cfE:
	mov	rax, qword ptr [rcx]
	movabs	r8, 4294967296
	add	r8, rax
	shl	rax, 32
	movabs	rdx, -4294967296
	and	rdx, r8
	xor	r8d, r8d
	test	rax, rax
	cmove	rdx, rax
	setne	r8b
	or	rdx, r8
	mov	qword ptr [rcx], rdx
	ret
```

Nightly (better thanks to ScalarPair, maybe?):
```asm
_ZN4blah7inc_opt17h66df690be0b5899dE:
	mov	r8, qword ptr [rcx]
	mov	rdx, r8
	shr	rdx, 32
	xor	eax, eax
	test	r8d, r8d
	setne	al
	add	edx, 1
	mov	dword ptr [rcx], eax
	mov	dword ptr [rcx + 4], edx
	ret
```

This PR:
```asm
_ZN4blah7inc_opt17h1426dc215ecbdb19E:
	xor	eax, eax
	cmp	dword ptr [rcx], 0
	setne	al
	mov	dword ptr [rcx], eax
	add	dword ptr [rcx + 4], 1
	ret
```

Where that add is beautiful -- using an addressing mode to not even need to explicitly go through a register -- and the remaining imperfection is well-known (rust-lang#49420 (comment)).
</details>
  • Loading branch information
kennytm committed Jul 22, 2018
2 parents 7019cce + c9482f7 commit b954d4d
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/libcore/mem.rs
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ pub unsafe fn uninitialized<T>() -> T {
#[stable(feature = "rust1", since = "1.0.0")]
pub fn swap<T>(x: &mut T, y: &mut T) {
unsafe {
ptr::swap_nonoverlapping(x, y, 1);
ptr::swap_nonoverlapping_one(x, y);
}
}

Expand Down
13 changes: 13 additions & 0 deletions src/libcore/ptr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,19 @@ pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
swap_nonoverlapping_bytes(x, y, len)
}

#[inline]
pub(crate) unsafe fn swap_nonoverlapping_one<T>(x: *mut T, y: *mut T) {
// For types smaller than the block optimization below,
// just swap directly to avoid pessimizing codegen.
if mem::size_of::<T>() < 32 {
let z = read(x);
copy_nonoverlapping(y, x, 1);
write(y, z);
} else {
swap_nonoverlapping(x, y, 1);
}
}

#[inline]
unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
// The approach here is to utilize simd to swap x & y efficiently. Testing reveals
Expand Down
27 changes: 27 additions & 0 deletions src/test/codegen/swap-small-types.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// compile-flags: -O
// only-x86_64

#![crate_type = "lib"]

use std::mem::swap;

type RGB48 = [u16; 3];

// CHECK-LABEL: @swap_rgb48
#[no_mangle]
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
// CHECK-NOT: alloca
// CHECK: load i48
// CHECK: store i48
swap(x, y)
}

0 comments on commit b954d4d

Please sign in to comment.