From a25f7bdba565c14887ee15a41d575befd37fea0b Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 28 Jan 2022 16:24:34 -0600 Subject: [PATCH] Don't copy `VMBuiltinFunctionsArray` into each `VMContext` (#3741) * Don't copy `VMBuiltinFunctionsArray` into each `VMContext` This is another PR along the lines of "let's squeeze all possible performance we can out of instantiation". Before this PR we would copy, by value, the contents of `VMBuiltinFunctionsArray` into each `VMContext` allocated. This array of function pointers is modestly-sized but growing over time as we add various intrinsics. Additionally it's the exact same for all `VMContext` allocations. This PR attempts to speed up instantiation slightly by instead storing an indirection to the function array. This means that calling a builtin intrinsic is a tad bit slower since it requires two loads instead of one (one to get the base pointer, another to get the actual address). Otherwise though `VMContext` initialization is now simply setting one pointer instead of doing a `memcpy` from one location to another. With some macro-magic this commit also replaces the previous implementation with one that's more `const`-friendly which also gets us compile-time type-checks of libcalls as well as compile-time verification that all libcalls are defined. Overall, as with #3739, the win is very modest here. Locally I measured a speedup from 1.9us to 1.7us taken to instantiate an empty module with one function. While small at these scales it's still a 10% improvement! * Review comments --- crates/cranelift/src/func_environ.rs | 11 ++- crates/environ/src/vmoffsets.rs | 20 ++---- crates/runtime/src/instance/allocator.rs | 6 +- crates/runtime/src/libcalls.rs | 69 ++++++++++-------- crates/runtime/src/vmcontext.rs | 90 +++++++++--------------- 5 files changed, 86 insertions(+), 110 deletions(-) diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 71bcf215de2a..9c1f4afa4310 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -281,10 +281,15 @@ impl<'module_environment> FuncEnvironment<'module_environment> { let mut mem_flags = ir::MemFlags::trusted(); mem_flags.set_readonly(); + // Load the base of the array of builtin functions + let array_offset = i32::try_from(self.offsets.vmctx_builtin_functions()).unwrap(); + let array_addr = pos.ins().load(pointer_type, mem_flags, base, array_offset); + // Load the callee address. - let body_offset = - i32::try_from(self.offsets.vmctx_builtin_function(callee_func_idx)).unwrap(); - let func_addr = pos.ins().load(pointer_type, mem_flags, base, body_offset); + let body_offset = i32::try_from(callee_func_idx.index() * pointer_type.bytes()).unwrap(); + let func_addr = pos + .ins() + .load(pointer_type, mem_flags, array_addr, body_offset); (base, func_addr) } diff --git a/crates/environ/src/vmoffsets.rs b/crates/environ/src/vmoffsets.rs index a2f0b9d558f4..d460fd54e8e5 100644 --- a/crates/environ/src/vmoffsets.rs +++ b/crates/environ/src/vmoffsets.rs @@ -16,12 +16,12 @@ // memories: [VMMemoryDefinition; module.num_defined_memories], // globals: [VMGlobalDefinition; module.num_defined_globals], // anyfuncs: [VMCallerCheckedAnyfunc; module.num_imported_functions + module.num_defined_functions], -// builtins: VMBuiltinFunctionsArray, +// builtins: *mut VMBuiltinFunctionsArray, // } use crate::{ - BuiltinFunctionIndex, DefinedGlobalIndex, DefinedMemoryIndex, DefinedTableIndex, FuncIndex, - GlobalIndex, MemoryIndex, Module, TableIndex, TypeIndex, + DefinedGlobalIndex, DefinedMemoryIndex, DefinedTableIndex, FuncIndex, GlobalIndex, MemoryIndex, + Module, TableIndex, TypeIndex, }; use more_asserts::assert_lt; use std::convert::TryFrom; @@ -287,11 +287,7 @@ impl From> for VMOffsets

{ .unwrap(); ret.size = ret .builtin_functions - .checked_add( - BuiltinFunctionIndex::builtin_functions_total_number() - .checked_mul(u32::from(ret.pointer_size())) - .unwrap(), - ) + .checked_add(u32::from(ret.pointer_size())) .unwrap(); return ret; @@ -597,7 +593,7 @@ impl VMOffsets

{ /// The offset of the builtin functions array. #[inline] - pub fn vmctx_builtin_functions_begin(&self) -> u32 { + pub fn vmctx_builtin_functions(&self) -> u32 { self.builtin_functions } @@ -739,12 +735,6 @@ impl VMOffsets

{ pub fn vmctx_vmglobal_import_from(&self, index: GlobalIndex) -> u32 { self.vmctx_vmglobal_import(index) + u32::from(self.vmglobal_import_from()) } - - /// Return the offset to builtin function in `VMBuiltinFunctionsArray` index `index`. - #[inline] - pub fn vmctx_builtin_function(&self, index: BuiltinFunctionIndex) -> u32 { - self.vmctx_builtin_functions_begin() + index.index() * u32::from(self.pointer_size()) - } } /// Offsets for `VMExternData`. diff --git a/crates/runtime/src/instance/allocator.rs b/crates/runtime/src/instance/allocator.rs index 82c1eec31e8e..63081837fe0a 100644 --- a/crates/runtime/src/instance/allocator.rs +++ b/crates/runtime/src/instance/allocator.rs @@ -481,10 +481,8 @@ unsafe fn initialize_vmcontext(instance: &mut Instance, req: InstanceAllocationR } // Initialize the built-in functions - ptr::write( - instance.vmctx_plus_offset(instance.offsets.vmctx_builtin_functions_begin()), - VMBuiltinFunctionsArray::initialized(), - ); + *instance.vmctx_plus_offset(instance.offsets.vmctx_builtin_functions()) = + &VMBuiltinFunctionsArray::INIT; // Initialize the imports debug_assert_eq!(req.imports.functions.len(), module.num_imported_funcs); diff --git a/crates/runtime/src/libcalls.rs b/crates/runtime/src/libcalls.rs index a6b29116727c..cbb31f03e6d4 100644 --- a/crates/runtime/src/libcalls.rs +++ b/crates/runtime/src/libcalls.rs @@ -185,14 +185,14 @@ pub extern "C" fn wasmtime_f64_nearest(x: f64) -> f64 { } /// Implementation of memory.grow for locally-defined 32-bit memories. -pub unsafe extern "C" fn wasmtime_memory32_grow( +pub unsafe extern "C" fn memory32_grow( vmctx: *mut VMContext, delta: u64, memory_index: u32, -) -> usize { +) -> *mut u8 { // Memory grow can invoke user code provided in a ResourceLimiter{,Async}, // so we need to catch a possible panic - match std::panic::catch_unwind(|| { + let ret = match std::panic::catch_unwind(|| { let instance = (*vmctx).instance_mut(); let memory_index = MemoryIndex::from_u32(memory_index); instance.memory_grow(memory_index, delta) @@ -201,11 +201,12 @@ pub unsafe extern "C" fn wasmtime_memory32_grow( Ok(Ok(None)) => usize::max_value(), Ok(Err(err)) => crate::traphandlers::raise_user_trap(err), Err(p) => resume_panic(p), - } + }; + ret as *mut u8 } /// Implementation of `table.grow`. -pub unsafe extern "C" fn wasmtime_table_grow( +pub unsafe extern "C" fn table_grow( vmctx: *mut VMContext, table_index: u32, delta: u32, @@ -238,8 +239,11 @@ pub unsafe extern "C" fn wasmtime_table_grow( } } +pub use table_grow as table_grow_funcref; +pub use table_grow as table_grow_externref; + /// Implementation of `table.fill`. -pub unsafe extern "C" fn wasmtime_table_fill( +pub unsafe extern "C" fn table_fill( vmctx: *mut VMContext, table_index: u32, dst: u32, @@ -272,8 +276,11 @@ pub unsafe extern "C" fn wasmtime_table_fill( } } +pub use table_fill as table_fill_funcref; +pub use table_fill as table_fill_externref; + /// Implementation of `table.copy`. -pub unsafe extern "C" fn wasmtime_table_copy( +pub unsafe extern "C" fn table_copy( vmctx: *mut VMContext, dst_table_index: u32, src_table_index: u32, @@ -295,7 +302,7 @@ pub unsafe extern "C" fn wasmtime_table_copy( } /// Implementation of `table.init`. -pub unsafe extern "C" fn wasmtime_table_init( +pub unsafe extern "C" fn table_init( vmctx: *mut VMContext, table_index: u32, elem_index: u32, @@ -315,14 +322,14 @@ pub unsafe extern "C" fn wasmtime_table_init( } /// Implementation of `elem.drop`. -pub unsafe extern "C" fn wasmtime_elem_drop(vmctx: *mut VMContext, elem_index: u32) { +pub unsafe extern "C" fn elem_drop(vmctx: *mut VMContext, elem_index: u32) { let elem_index = ElemIndex::from_u32(elem_index); let instance = (*vmctx).instance_mut(); instance.elem_drop(elem_index); } /// Implementation of `memory.copy` for locally defined memories. -pub unsafe extern "C" fn wasmtime_memory_copy( +pub unsafe extern "C" fn memory_copy( vmctx: *mut VMContext, dst_index: u32, dst: u64, @@ -342,7 +349,7 @@ pub unsafe extern "C" fn wasmtime_memory_copy( } /// Implementation of `memory.fill` for locally defined memories. -pub unsafe extern "C" fn wasmtime_memory_fill( +pub unsafe extern "C" fn memory_fill( vmctx: *mut VMContext, memory_index: u32, dst: u64, @@ -360,7 +367,7 @@ pub unsafe extern "C" fn wasmtime_memory_fill( } /// Implementation of `memory.init`. -pub unsafe extern "C" fn wasmtime_memory_init( +pub unsafe extern "C" fn memory_init( vmctx: *mut VMContext, memory_index: u32, data_index: u32, @@ -380,14 +387,14 @@ pub unsafe extern "C" fn wasmtime_memory_init( } /// Implementation of `data.drop`. -pub unsafe extern "C" fn wasmtime_data_drop(vmctx: *mut VMContext, data_index: u32) { +pub unsafe extern "C" fn data_drop(vmctx: *mut VMContext, data_index: u32) { let data_index = DataIndex::from_u32(data_index); let instance = (*vmctx).instance_mut(); instance.data_drop(data_index) } /// Drop a `VMExternRef`. -pub unsafe extern "C" fn wasmtime_drop_externref(externref: *mut u8) { +pub unsafe extern "C" fn drop_externref(externref: *mut u8) { let externref = externref as *mut crate::externref::VMExternData; let externref = NonNull::new(externref).unwrap(); crate::externref::VMExternData::drop_and_dealloc(externref); @@ -395,7 +402,7 @@ pub unsafe extern "C" fn wasmtime_drop_externref(externref: *mut u8) { /// Do a GC and insert the given `externref` into the /// `VMExternRefActivationsTable`. -pub unsafe extern "C" fn wasmtime_activations_table_insert_with_gc( +pub unsafe extern "C" fn activations_table_insert_with_gc( vmctx: *mut VMContext, externref: *mut u8, ) { @@ -416,10 +423,7 @@ pub unsafe extern "C" fn wasmtime_activations_table_insert_with_gc( } /// Perform a Wasm `global.get` for `externref` globals. -pub unsafe extern "C" fn wasmtime_externref_global_get( - vmctx: *mut VMContext, - index: u32, -) -> *mut u8 { +pub unsafe extern "C" fn externref_global_get(vmctx: *mut VMContext, index: u32) -> *mut u8 { let index = GlobalIndex::from_u32(index); let instance = (*vmctx).instance(); let global = instance.defined_or_imported_global_ptr(index); @@ -436,7 +440,7 @@ pub unsafe extern "C" fn wasmtime_externref_global_get( } /// Perform a Wasm `global.set` for `externref` globals. -pub unsafe extern "C" fn wasmtime_externref_global_set( +pub unsafe extern "C" fn externref_global_set( vmctx: *mut VMContext, index: u32, externref: *mut u8, @@ -460,13 +464,14 @@ pub unsafe extern "C" fn wasmtime_externref_global_set( } /// Implementation of `memory.atomic.notify` for locally defined memories. -pub unsafe extern "C" fn wasmtime_memory_atomic_notify( +pub unsafe extern "C" fn memory_atomic_notify( vmctx: *mut VMContext, memory_index: u32, - addr: usize, + addr: *mut u8, _count: u32, ) -> u32 { let result = { + let addr = addr as usize; let memory = MemoryIndex::from_u32(memory_index); let instance = (*vmctx).instance(); // this should never overflow since addr + 4 either hits a guard page @@ -475,7 +480,7 @@ pub unsafe extern "C" fn wasmtime_memory_atomic_notify( let addr_to_check = addr.checked_add(4).unwrap(); validate_atomic_addr(instance, memory, addr_to_check).and_then(|()| { Err(Trap::User(anyhow::anyhow!( - "unimplemented: wasm atomics (fn wasmtime_memory_atomic_notify) unsupported", + "unimplemented: wasm atomics (fn memory_atomic_notify) unsupported", ))) }) }; @@ -486,14 +491,15 @@ pub unsafe extern "C" fn wasmtime_memory_atomic_notify( } /// Implementation of `memory.atomic.wait32` for locally defined memories. -pub unsafe extern "C" fn wasmtime_memory_atomic_wait32( +pub unsafe extern "C" fn memory_atomic_wait32( vmctx: *mut VMContext, memory_index: u32, - addr: usize, + addr: *mut u8, _expected: u32, _timeout: u64, ) -> u32 { let result = { + let addr = addr as usize; let memory = MemoryIndex::from_u32(memory_index); let instance = (*vmctx).instance(); // see wasmtime_memory_atomic_notify for why this shouldn't overflow @@ -501,7 +507,7 @@ pub unsafe extern "C" fn wasmtime_memory_atomic_wait32( let addr_to_check = addr.checked_add(4).unwrap(); validate_atomic_addr(instance, memory, addr_to_check).and_then(|()| { Err(Trap::User(anyhow::anyhow!( - "unimplemented: wasm atomics (fn wasmtime_memory_atomic_wait32) unsupported", + "unimplemented: wasm atomics (fn memory_atomic_wait32) unsupported", ))) }) }; @@ -512,14 +518,15 @@ pub unsafe extern "C" fn wasmtime_memory_atomic_wait32( } /// Implementation of `memory.atomic.wait64` for locally defined memories. -pub unsafe extern "C" fn wasmtime_memory_atomic_wait64( +pub unsafe extern "C" fn memory_atomic_wait64( vmctx: *mut VMContext, memory_index: u32, - addr: usize, + addr: *mut u8, _expected: u64, _timeout: u64, ) -> u32 { let result = { + let addr = addr as usize; let memory = MemoryIndex::from_u32(memory_index); let instance = (*vmctx).instance(); // see wasmtime_memory_atomic_notify for why this shouldn't overflow @@ -527,7 +534,7 @@ pub unsafe extern "C" fn wasmtime_memory_atomic_wait64( let addr_to_check = addr.checked_add(8).unwrap(); validate_atomic_addr(instance, memory, addr_to_check).and_then(|()| { Err(Trap::User(anyhow::anyhow!( - "unimplemented: wasm atomics (fn wasmtime_memory_atomic_wait64) unsupported", + "unimplemented: wasm atomics (fn memory_atomic_wait64) unsupported", ))) }) }; @@ -561,7 +568,7 @@ unsafe fn validate_atomic_addr( } /// Hook for when an instance runs out of fuel. -pub unsafe extern "C" fn wasmtime_out_of_gas(vmctx: *mut VMContext) { +pub unsafe extern "C" fn out_of_gas(vmctx: *mut VMContext) { match (*(*vmctx).instance().store()).out_of_gas() { Ok(()) => {} Err(err) => crate::traphandlers::raise_user_trap(err), @@ -569,7 +576,7 @@ pub unsafe extern "C" fn wasmtime_out_of_gas(vmctx: *mut VMContext) { } /// Hook for when an instance observes that the epoch has changed. -pub unsafe extern "C" fn wasmtime_new_epoch(vmctx: *mut VMContext) -> u64 { +pub unsafe extern "C" fn new_epoch(vmctx: *mut VMContext) -> u64 { match (*(*vmctx).instance().store()).new_epoch() { Ok(new_deadline) => new_deadline, Err(err) => crate::traphandlers::raise_user_trap(err), diff --git a/crates/runtime/src/vmcontext.rs b/crates/runtime/src/vmcontext.rs index 2441e78d64e6..f60ce4723cce 100644 --- a/crates/runtime/src/vmcontext.rs +++ b/crates/runtime/src/vmcontext.rs @@ -9,7 +9,6 @@ use std::marker; use std::ptr::NonNull; use std::sync::atomic::{AtomicUsize, Ordering::SeqCst}; use std::u32; -use wasmtime_environ::BuiltinFunctionIndex; /// An imported function. #[derive(Debug, Copy, Clone)] @@ -594,66 +593,43 @@ mod test_vmcaller_checked_anyfunc { } } -/// An array that stores addresses of builtin functions. We translate code -/// to use indirect calls. This way, we don't have to patch the code. -#[repr(C)] -pub struct VMBuiltinFunctionsArray { - ptrs: [usize; Self::len()], -} +macro_rules! define_builtin_array { + ( + $( + $( #[$attr:meta] )* + $name:ident( $( $param:ident ),* ) -> ( $( $result:ident ),* ); + )* + ) => { + /// An array that stores addresses of builtin functions. We translate code + /// to use indirect calls. This way, we don't have to patch the code. + #[repr(C)] + #[allow(unused_parens)] + pub struct VMBuiltinFunctionsArray { + $( + $name: unsafe extern "C" fn( + $(define_builtin_array!(@ty $param)),* + ) -> ( + $(define_builtin_array!(@ty $result)),* + ), + )* + } -impl VMBuiltinFunctionsArray { - pub const fn len() -> usize { - BuiltinFunctionIndex::builtin_functions_total_number() as usize - } - - pub fn initialized() -> Self { - use crate::libcalls::*; - - let mut ptrs = [0; Self::len()]; - - ptrs[BuiltinFunctionIndex::memory32_grow().index() as usize] = - wasmtime_memory32_grow as usize; - ptrs[BuiltinFunctionIndex::table_copy().index() as usize] = wasmtime_table_copy as usize; - ptrs[BuiltinFunctionIndex::table_grow_funcref().index() as usize] = - wasmtime_table_grow as usize; - ptrs[BuiltinFunctionIndex::table_grow_externref().index() as usize] = - wasmtime_table_grow as usize; - ptrs[BuiltinFunctionIndex::table_init().index() as usize] = wasmtime_table_init as usize; - ptrs[BuiltinFunctionIndex::elem_drop().index() as usize] = wasmtime_elem_drop as usize; - ptrs[BuiltinFunctionIndex::memory_copy().index() as usize] = wasmtime_memory_copy as usize; - ptrs[BuiltinFunctionIndex::memory_fill().index() as usize] = wasmtime_memory_fill as usize; - ptrs[BuiltinFunctionIndex::memory_init().index() as usize] = wasmtime_memory_init as usize; - ptrs[BuiltinFunctionIndex::data_drop().index() as usize] = wasmtime_data_drop as usize; - ptrs[BuiltinFunctionIndex::drop_externref().index() as usize] = - wasmtime_drop_externref as usize; - ptrs[BuiltinFunctionIndex::activations_table_insert_with_gc().index() as usize] = - wasmtime_activations_table_insert_with_gc as usize; - ptrs[BuiltinFunctionIndex::externref_global_get().index() as usize] = - wasmtime_externref_global_get as usize; - ptrs[BuiltinFunctionIndex::externref_global_set().index() as usize] = - wasmtime_externref_global_set as usize; - ptrs[BuiltinFunctionIndex::table_fill_externref().index() as usize] = - wasmtime_table_fill as usize; - ptrs[BuiltinFunctionIndex::table_fill_funcref().index() as usize] = - wasmtime_table_fill as usize; - ptrs[BuiltinFunctionIndex::memory_atomic_notify().index() as usize] = - wasmtime_memory_atomic_notify as usize; - ptrs[BuiltinFunctionIndex::memory_atomic_wait32().index() as usize] = - wasmtime_memory_atomic_wait32 as usize; - ptrs[BuiltinFunctionIndex::memory_atomic_wait64().index() as usize] = - wasmtime_memory_atomic_wait64 as usize; - ptrs[BuiltinFunctionIndex::out_of_gas().index() as usize] = wasmtime_out_of_gas as usize; - ptrs[BuiltinFunctionIndex::new_epoch().index() as usize] = wasmtime_new_epoch as usize; - - if cfg!(debug_assertions) { - for i in 0..ptrs.len() { - debug_assert!(ptrs[i] != 0, "index {} is not initialized", i); - } + impl VMBuiltinFunctionsArray { + pub const INIT: VMBuiltinFunctionsArray = VMBuiltinFunctionsArray { + $($name: crate::libcalls::$name,)* + }; } - Self { ptrs } - } + }; + + (@ty i32) => (u32); + (@ty i64) => (u64); + (@ty reference) => (*mut u8); + (@ty pointer) => (*mut u8); + (@ty vmctx) => (*mut VMContext); } +wasmtime_environ::foreach_builtin_function!(define_builtin_array); + /// The storage for a WebAssembly invocation argument /// /// TODO: These could be packed more densely, rather than using the same size for every type.