Combine stack-based cleanups for faster wasm calls
This commit is an extension of bytecodealliance#2757 where the goal is to optimize entry
into WebAssembly. Currently wasmtime has two stack-based cleanups when
entering wasm: one for the externref activation table and another for
resetting stack limits. This commit fuses those two cleanups into one and
moves some code around, which allows fewer closures with fewer captured
variables and speeds up calls into wasm a bit more. Overall this drops the
execution time of a call from 88ns to 80ns locally for me.
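
The shape of the fused cleanup, distilled into a sketch (the type and field
names here are simplified stand-ins, not the exact wasmtime definitions; the
real version is the `Reset` guard added to `crates/wasmtime/src/func.rs`
below):

```rust
use std::cell::Cell;
use std::sync::atomic::{AtomicUsize, Ordering::Relaxed};

// Simplified stand-in for the real store/interrupt types.
struct Store {
    stack_limit: AtomicUsize,          // usize::MAX means "no wasm on the stack"
    stack_canary: Cell<Option<usize>>, // stack pointer at the oldest wasm entry
}

// A single guard that undoes both pieces of per-call state on exit,
// replacing two separate stack-based cleanups (and their captures).
struct Reset<'a>(&'a Store);

impl Drop for Reset<'_> {
    #[inline]
    fn drop(&mut self) {
        self.0.stack_canary.set(None);
        self.0.stack_limit.store(usize::MAX, Relaxed);
    }
}
```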

This also updates the atomic orderings when updating the stack limit
from `SeqCst` to `Relaxed`. While `SeqCst` is a reasonable starting
point, `Relaxed` should be safe here because we're not using the
atomics to protect any memory; they simply receive signals from other
threads.
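
A minimal sketch of the swap-and-consume pattern this enables, assuming
stand-in sentinel values for illustration (the real constant is
`wasmtime_environ::INTERRUPTED`; its exact value is assumed here):

```rust
use std::sync::atomic::{AtomicUsize, Ordering::Relaxed};

// Assumed sentinels for illustration only.
const NO_WASM_ON_STACK: usize = usize::MAX;
const INTERRUPTED: usize = usize::MAX - 1;

fn try_enter_wasm(stack_limit: &AtomicUsize, wasm_stack_limit: usize) -> Result<(), &'static str> {
    // Swap the limit in atomically so a concurrently-sent interrupt is never
    // lost. Relaxed suffices: this atomic only carries a cross-thread signal,
    // it doesn't publish any other memory that needs synchronizing.
    match stack_limit.swap(wasm_stack_limit, Relaxed) {
        INTERRUPTED => {
            // An interrupt arrived before we entered wasm; consume it,
            // restore the sentinel, and report a trap.
            stack_limit.store(NO_WASM_ON_STACK, Relaxed);
            Err("interrupted before wasm entry")
        }
        prev => {
            // First entry on this stack: the previous value must have been
            // the "no wasm on the stack" sentinel.
            debug_assert_eq!(prev, NO_WASM_ON_STACK);
            Ok(())
        }
    }
}
```
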
alexcrichton committed Mar 23, 2021
1 parent c95971a commit 31b6857
Showing 9 changed files with 112 additions and 181 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion crates/runtime/Cargo.toml
@@ -24,7 +24,6 @@ more-asserts = "0.2.1"
cfg-if = "1.0"
backtrace = "0.3.55"
lazy_static = "1.3.0"
-psm = "0.1.11"
rand = "0.8.3"
anyhow = "1.0.38"

79 changes: 11 additions & 68 deletions crates/runtime/src/externref.rs
@@ -522,13 +522,13 @@ pub struct VMExternRefActivationsTable {
/// than create a new hash set every GC.
precise_stack_roots: RefCell<HashSet<VMExternRefWithTraits>>,

-/// A pointer to a `u8` on the youngest host stack frame before we called
+/// A pointer to the youngest host stack frame before we called
/// into Wasm for the first time. When walking the stack in garbage
/// collection, if we don't find this frame, then we failed to walk every
/// Wasm stack frame, which means we failed to find all on-stack,
/// inside-a-Wasm-frame roots, and doing a GC could lead to freeing one of
/// those missed roots, and use after free.
-stack_canary: Cell<Option<NonNull<u8>>>,
+stack_canary: Cell<Option<usize>>,
}

impl VMExternRefActivationsTable {
@@ -717,73 +717,16 @@ impl VMExternRefActivationsTable {
}
}

-/// Set the stack canary around a call into Wasm.
-///
-/// The return value should not be dropped until after the Wasm call has
-/// returned.
-///
-/// While this method is always safe to call (or not call), it is unsafe to
-/// call the `wasmtime_runtime::gc` function unless this method is called at
-/// the proper times and its return value properly outlives its Wasm call.
-///
-/// For `gc` to be safe, this is only *strictly required* to surround the
-/// oldest host-->Wasm stack frame transition on this thread, but repeatedly
-/// calling it is idempotent and cheap, so it is recommended to call this
-/// for every host-->Wasm call.
-///
-/// # Example
-///
-/// ```no_run
-/// use wasmtime_runtime::*;
-///
-/// # let get_table_from_somewhere = || unimplemented!();
-/// let table: &VMExternRefActivationsTable = get_table_from_somewhere();
-///
-/// // Set the canary before a Wasm call. The canary should always be a
-/// // local on the stack.
-/// let canary = 0;
-/// let auto_reset_canary = table.set_stack_canary(&canary);
-///
-/// // Do the call into Wasm.
-/// # let call_into_wasm = || unimplemented!();
-/// call_into_wasm();
-///
-/// // Only drop the value returned by `set_stack_canary` after the Wasm
-/// // call has returned.
-/// drop(auto_reset_canary);
-/// ```
+/// Fetches the stack canary currently recorded for this table, if any.
#[inline]
-pub fn set_stack_canary<'a>(&'a self, canary: &u8) -> impl Drop + 'a {
-let should_reset = if self.stack_canary.get().is_none() {
-let canary = canary as *const u8 as *mut u8;
-self.stack_canary.set(Some(unsafe {
-debug_assert!(!canary.is_null());
-NonNull::new_unchecked(canary)
-}));
-true
-} else {
-false
-};
-
-return AutoResetCanary {
-table: self,
-should_reset,
-};
-
-struct AutoResetCanary<'a> {
-table: &'a VMExternRefActivationsTable,
-should_reset: bool,
-}
+pub fn stack_canary(&self) -> Option<usize> {
+self.stack_canary.get()
+}

-impl Drop for AutoResetCanary<'_> {
-#[inline]
-fn drop(&mut self) {
-if self.should_reset {
-debug_assert!(self.table.stack_canary.get().is_some());
-self.table.stack_canary.set(None);
-}
-}
-}
+/// Sets or clears the stack canary recorded for this table.
+#[inline]
+pub fn set_stack_canary(&self, canary: Option<usize>) {
+self.stack_canary.set(canary);
+}
}

@@ -1066,7 +1009,7 @@ pub unsafe fn gc(
log::debug!("end GC");
return;
}
-Some(canary) => canary.as_ptr() as usize,
+Some(canary) => canary,
};

// There is a stack canary, so there must be Wasm frames on the stack. The
99 changes: 1 addition & 98 deletions crates/runtime/src/traphandlers.rs
@@ -8,7 +8,7 @@ use std::cell::{Cell, UnsafeCell};
use std::error::Error;
use std::mem::MaybeUninit;
use std::ptr;
-use std::sync::atomic::{AtomicUsize, Ordering::SeqCst};
+use std::sync::atomic::Ordering::SeqCst;
use std::sync::Once;
use wasmtime_environ::ir;

@@ -217,10 +217,6 @@ pub unsafe trait TrapInfo {
/// Returns `true` if `call` returns true, otherwise returns `false`.
fn custom_signal_handler(&self, call: &dyn Fn(&SignalHandler) -> bool) -> bool;

-/// Returns the maximum size, in bytes, the wasm native stack is allowed to
-/// grow to.
-fn max_wasm_stack(&self) -> usize;

/// Callback invoked whenever WebAssembly has entirely consumed the fuel
/// that it was allotted.
///
@@ -251,7 +247,6 @@ impl<'a> CallThreadState<'a> {
}

fn with(self, closure: impl FnOnce(&CallThreadState) -> i32) -> Result<(), Trap> {
-let _reset = self.update_stack_limit()?;
let ret = tls::set(&self, || closure(&self));
if ret != 0 {
return Ok(());
@@ -273,98 +268,6 @@ impl<'a> CallThreadState<'a> {
}
}

-/// Checks and/or initializes the wasm native call stack limit.
-///
-/// This function will inspect the current state of the stack and calling
-/// context to determine which of three buckets we're in:
-///
-/// 1. We are the first wasm call on the stack. This means that we need to
-/// set up a stack limit; if the native wasm stack pointer goes beyond that
-/// limit, a trap is forced. For now we simply reserve an
-/// arbitrary chunk of bytes (1 MB from roughly the current native stack
-/// pointer). This logic will likely get tweaked over time.
-///
-/// 2. We aren't the first wasm call on the stack. In this scenario the wasm
-/// stack limit is already configured. In this wasm -> host -> wasm case
-/// we assume that the native stack consumed by the host is accounted for
-/// in the initial stack limit calculation. That means that in this
-/// scenario we do nothing.
-///
-/// 3. We were previously interrupted. In this case we consume the interrupt
-/// here and return a trap, clearing the interrupt and allowing the next
-/// wasm call to proceed.
-///
-/// The return value here is a trap for case 3, a noop destructor in case 2,
-/// and a meaningful destructor in case 1.
-///
-/// For more information about interrupts and stack limits see
-/// `crates/environ/src/cranelift.rs`.
-///
-/// Note that this function must be called with `self` on the stack, not the
-/// heap/etc.
-#[inline]
-fn update_stack_limit(&self) -> Result<impl Drop + '_, Trap> {
-// Determine the stack pointer where, after which, any wasm code will
-// immediately trap. This is checked on the entry to all wasm functions.
-//
-// Note that this isn't 100% precise. We are requested to give wasm
-// `max_wasm_stack` bytes, but what we're actually doing is giving wasm
-// probably a little less than `max_wasm_stack` because we're
-// calculating the limit relative to this function's approximate stack
-// pointer. Wasm will be executed on a frame beneath this one (or next
-// to it). In any case it's expected to be at most a few hundred bytes
-// of slop one way or another. When wasm is typically given a MB or so
-// (a million bytes) the slop shouldn't matter too much.
-let wasm_stack_limit = psm::stack_pointer() as usize - self.trap_info.max_wasm_stack();
-
-let interrupts = self.trap_info.interrupts();
-let reset_stack_limit = match interrupts.stack_limit.compare_exchange(
-usize::max_value(),
-wasm_stack_limit,
-SeqCst,
-SeqCst,
-) {
-Ok(_) => {
-// We're the first wasm on the stack so we've now reserved the
-// `max_wasm_stack` bytes of native stack space for wasm.
-// Nothing left to do here now except reset back when we're
-// done.
-true
-}
-Err(n) if n == wasmtime_environ::INTERRUPTED => {
-// This means that an interrupt happened before we actually
-// called this function, which means that we're now
-// considered interrupted. Be sure to consume this interrupt
-// as part of this process too.
-interrupts.stack_limit.store(usize::max_value(), SeqCst);
-return Err(Trap::Wasm {
-trap_code: ir::TrapCode::Interrupt,
-backtrace: Backtrace::new_unresolved(),
-});
-}
-Err(_) => {
-// The stack limit was previously set by a previous wasm
-// call on the stack. We leave the original stack limit for
-// wasm in place in that case, and don't reset the stack
-// limit when we're done.
-false
-}
-};
-
-struct Reset<'a>(bool, &'a AtomicUsize);
-
-impl Drop for Reset<'_> {
-#[inline]
-fn drop(&mut self) {
-if self.0 {
-self.1.store(usize::max_value(), SeqCst);
-}
-}
-}
-
-Ok(Reset(reset_stack_limit, &interrupts.stack_limit))
-}

fn unwind_with(&self, reason: UnwindReason) -> ! {
unsafe {
(*self.unwind.get()).as_mut_ptr().write(reason);
1 change: 1 addition & 0 deletions crates/wasmtime/Cargo.toml
@@ -35,6 +35,7 @@ serde = { version = "1.0.94", features = ["derive"] }
bincode = "1.2.1"
indexmap = "1.6"
paste = "1.0.3"
+psm = "0.1.11"

[target.'cfg(target_os = "windows")'.dependencies]
winapi = "0.3.7"
98 changes: 94 additions & 4 deletions crates/wasmtime/src/func.rs
@@ -10,6 +10,7 @@ use std::mem;
use std::panic::{self, AssertUnwindSafe};
use std::pin::Pin;
use std::ptr::{self, NonNull};
+use std::sync::atomic::Ordering::Relaxed;
use wasmtime_environ::wasm::{EntityIndex, FuncIndex};
use wasmtime_runtime::{
raise_user_trap, ExportFunction, InstanceAllocator, InstanceHandle, OnDemandInstanceAllocator,
@@ -1149,20 +1150,109 @@ impl fmt::Debug for Func {
}
}

+#[inline]
pub(crate) fn invoke_wasm_and_catch_traps(
store: &Store,
closure: impl FnMut(),
) -> Result<(), Trap> {
unsafe {
-let canary = 0;
-let _auto_reset_canary = store
-.externref_activations_table()
-.set_stack_canary(&canary);
+let _reset = if store.externref_activations_table().stack_canary().is_some() {
+None
+} else {
+Some(enter_wasm_init(store)?)
+};

wasmtime_runtime::catch_traps(store, closure).map_err(|e| Trap::from_runtime(store, e))
}
}

+/// This function is called to register state within `Store` whenever
+/// WebAssembly is entered for the first time within the `Store`. This isn't
+/// called when wasm is called recursively within the `Store`.
+///
+/// This function sets up various limits such as:
+///
+/// * The stack limit. This is what ensures that we limit the stack space
+/// allocated by WebAssembly code, and it's relative to the initial stack
+/// pointer that called into wasm.
+///
+/// * Stack canaries for externref gc tracing. Currently the implementation
+/// relies on walking frames but the stack walker isn't always 100% reliable,
+/// so a canary is used to ensure that if the canary is seen then it's
+/// guaranteed all wasm frames have been walked.
+///
+/// This function may fail if the stack limit can't be set because an
+/// interrupt already happened. Otherwise it returns a value that resets the
+/// various limits on `Drop`.
+#[inline]
+fn enter_wasm_init<'a>(store: &'a Store) -> Result<impl Drop + 'a, Trap> {
+let stack_pointer = psm::stack_pointer() as usize;
+
+// Determine the stack pointer where, after which, any wasm code will
+// immediately trap. This is checked on the entry to all wasm functions.
+//
+// Note that this isn't 100% precise. We are requested to give wasm
+// `max_wasm_stack` bytes, but what we're actually doing is giving wasm
+// probably a little less than `max_wasm_stack` because we're
+// calculating the limit relative to this function's approximate stack
+// pointer. Wasm will be executed on a frame beneath this one (or next
+// to it). In any case it's expected to be at most a few hundred bytes
+// of slop one way or another. When wasm is typically given a MB or so
+// (a million bytes) the slop shouldn't matter too much.
+//
+// After we've got the stack limit then we store it into the `stack_limit`
+// variable. Note that the store is an atomic swap to ensure that we can
+// consume any previously-sent interrupt requests. If we found that wasm was
+// previously interrupted then we immediately return a trap (after resetting
+// the stack limit). Otherwise we're good to keep on going.
+//
+// Note the usage of `Relaxed` memory orderings here. This is specifically
+// an optimization in the `Drop` below where a `Relaxed` store is speedier
+// than a `SeqCst` store. The rationale for `Relaxed` is that the atomic
+// orderings here aren't actually protecting any memory; we're just trying
+// to be atomic with respect to this one location in memory (for when
+// `InterruptHandle` sends us a signal). Since we don't need to synchronize
+// with any other memory, `Relaxed` should be correct for our use case.
+let wasm_stack_limit = stack_pointer - store.engine().config().max_wasm_stack;
+let interrupts = store.interrupts();
+match interrupts.stack_limit.swap(wasm_stack_limit, Relaxed) {
+wasmtime_environ::INTERRUPTED => {
+// This means that an interrupt happened before we actually
+// called this function, which means that we're now
+// considered interrupted.
+interrupts.stack_limit.store(usize::max_value(), Relaxed);
+return Err(Trap::new_wasm(
+Some(store),
+None,
+wasmtime_environ::ir::TrapCode::Interrupt,
+backtrace::Backtrace::new_unresolved(),
+));
+}
+n => debug_assert_eq!(usize::max_value(), n),
+}
+store
+.externref_activations_table()
+.set_stack_canary(Some(stack_pointer));
+
+return Ok(Reset(store));
+
+struct Reset<'a>(&'a Store);
+
+impl Drop for Reset<'_> {
+#[inline]
+fn drop(&mut self) {
+self.0.externref_activations_table().set_stack_canary(None);
+
+// see docs above for why this uses `Relaxed`
+self.0
+.interrupts()
+.stack_limit
+.store(usize::max_value(), Relaxed);
+}
+}
+}

/// A trait implemented for types which can be returned from closures passed to
/// [`Func::wrap`] and friends.
///
