Skip to content

Commit

Permalink
Tweak parallelism and the instantiation benchmark (bytecodealliance#3775
Browse files Browse the repository at this point in the history
)

Currently the "sequential" and "parallel" benchmarks report somewhat
different timings. For sequential it's time-to-instantiate, but for
parallel it's time-to-instantiate-10k-instances. The parallelism in the
parallel benchmark can also theoretically be affected by rayon's
work-stealing. For example if rayon doesn't actually do any work
stealing at all then this ends up being a sequential test again.
Otherwise though it's possible for some threads to finish much earlier
as rayon isn't guaranteed to keep threads busy.

This commit applies a few updates to the benchmark:

* First an `InstancePre<T>` is now used instead of a `Linker<T>` to
  front-load type-checking and avoid that on each instantiation (and
  this is generally the fastest path to instantiate right now).

* Next the instantiation benchmark is changed to measure one
  instantiation-per-iteration to measure per-instance instantiation to
  better compare with sequential numbers.

* Finally rayon is removed in favor of manually creating background
  threads that infinitely do work until we tell them to stop. These
  background threads are guaranteed to be working for the entire time
the benchmark is executing and should theoretically exhibit the
situation where there are N units of work all happening at once.

I also applied some minor updates here such as having the parallel
instantiation defined conditionally for multiple modules as well as
upping the limits of the pooling allocator to handle a large module
(rustpython.wasm) that I threw at it.
  • Loading branch information
alexcrichton authored and mpardesh committed Mar 17, 2022
1 parent f81a663 commit da6d0d8
Showing 1 changed file with 114 additions and 65 deletions.
179 changes: 114 additions & 65 deletions benches/instantiation.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
use anyhow::Result;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use rayon::{prelude::*, ThreadPoolBuilder};
use std::{path::PathBuf, process::Command};
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst};
use std::sync::Arc;
use std::thread;
use wasmtime::*;
use wasmtime_wasi::{sync::WasiCtxBuilder, WasiCtx};

fn instantiate(linker: &Linker<WasiCtx>, module: &Module) -> Result<()> {
fn store(engine: &Engine) -> Store<WasiCtx> {
let wasi = WasiCtxBuilder::new().build();
let mut store = Store::new(module.engine(), wasi);
let _instance = linker.instantiate(&mut store, module)?;
Store::new(engine, wasi)
}

fn instantiate(pre: &InstancePre<WasiCtx>, engine: &Engine) -> Result<()> {
let mut store = store(engine);
let _instance = pre.instantiate(&mut store)?;

Ok(())
}
Expand All @@ -23,84 +30,105 @@ fn benchmark_name<'a>(strategy: &InstanceAllocationStrategy) -> &'static str {
}
}

fn bench_sequential(c: &mut Criterion, modules: &[&str]) {
fn bench_sequential(c: &mut Criterion, path: &Path) {
let mut group = c.benchmark_group("sequential");

for strategy in &[
// Skip the on-demand allocator when uffd is enabled
#[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
InstanceAllocationStrategy::OnDemand,
InstanceAllocationStrategy::pooling(),
] {
for file_name in modules {
let mut path = PathBuf::new();
path.push("benches");
path.push("instantiation");
path.push(file_name);

let mut config = Config::default();
config.allocation_strategy(strategy.clone());

let engine = Engine::new(&config).expect("failed to create engine");
let module = Module::from_file(&engine, &path)
.unwrap_or_else(|_| panic!("failed to load benchmark `{}`", path.display()));
let mut linker = Linker::new(&engine);
wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();

group.bench_function(BenchmarkId::new(benchmark_name(strategy), file_name), |b| {
b.iter(|| instantiate(&linker, &module).expect("failed to instantiate module"));
});
}
for strategy in strategies() {
let mut config = Config::default();
config.allocation_strategy(strategy.clone());

let engine = Engine::new(&config).expect("failed to create engine");
let module = Module::from_file(&engine, path)
.unwrap_or_else(|e| panic!("failed to load benchmark `{}`: {:?}", path.display(), e));
let mut linker = Linker::new(&engine);
wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
let pre = linker
.instantiate_pre(&mut store(&engine), &module)
.expect("failed to pre-instantiate");

group.bench_function(
BenchmarkId::new(
benchmark_name(&strategy),
path.file_name().unwrap().to_str().unwrap(),
),
|b| {
b.iter(|| instantiate(&pre, &engine).expect("failed to instantiate module"));
},
);
}

group.finish();
}

fn bench_parallel(c: &mut Criterion) {
const PARALLEL_INSTANCES: usize = 1000;

fn bench_parallel(c: &mut Criterion, path: &Path) {
let mut group = c.benchmark_group("parallel");

for strategy in &[
// Skip the on-demand allocator when uffd is enabled
#[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
InstanceAllocationStrategy::OnDemand,
InstanceAllocationStrategy::pooling(),
] {
for strategy in strategies() {
let mut config = Config::default();
config.allocation_strategy(strategy.clone());

let engine = Engine::new(&config).expect("failed to create engine");
let module = Module::from_file(&engine, "benches/instantiation/wasi.wasm")
.expect("failed to load WASI example module");
let module = Module::from_file(&engine, path).expect("failed to load WASI example module");
let mut linker = Linker::new(&engine);
wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
let pre = Arc::new(
linker
.instantiate_pre(&mut store(&engine), &module)
.expect("failed to pre-instantiate"),
);

for threads in 1..=num_cpus::get_physical() {
let pool = ThreadPoolBuilder::new()
.num_threads(threads)
.build()
.unwrap();

group.bench_function(
BenchmarkId::new(
benchmark_name(strategy),
benchmark_name(&strategy),
format!(
"{} instances with {} thread{}",
PARALLEL_INSTANCES,
"{}: with {} background thread{}",
path.file_name().unwrap().to_str().unwrap(),
threads,
if threads == 1 { "" } else { "s" }
),
),
|b| {
b.iter(|| {
pool.install(|| {
(0..PARALLEL_INSTANCES).into_par_iter().for_each(|_| {
instantiate(&linker, &module)
.expect("failed to instantiate module");
// Spin up N-1 threads doing background instantiations to
// simulate concurrent instantiations.
let done = Arc::new(AtomicBool::new(false));
let count = Arc::new(AtomicUsize::new(0));
let workers = (0..threads - 1)
.map(|_| {
let pre = pre.clone();
let done = done.clone();
let engine = engine.clone();
let count = count.clone();
thread::spawn(move || {
count.fetch_add(1, SeqCst);
while !done.load(SeqCst) {
instantiate(&pre, &engine).unwrap();
}
})
})
.collect::<Vec<_>>();

// Wait for our workers to all get started and have
// instantiated their first module, at which point they'll
// all be spinning.
while count.load(SeqCst) != threads - 1 {
thread::yield_now();
}

// Now that our background work is configured we can
// benchmark the amount of time it takes to instantiate this
// module.
b.iter(|| {
instantiate(&pre, &engine).expect("failed to instantiate module");
});

// Shut down this benchmark iteration by signalling to
// worker threads they should exit and then wait for them to
// have reached the exit point.
done.store(true, SeqCst);
for t in workers {
t.join().unwrap();
}
},
);
}
Expand Down Expand Up @@ -138,16 +166,37 @@ fn build_wasi_example() {

/// Entry point for the instantiation benchmarks: builds the WASI example
/// module, then runs the sequential and parallel benchmarks over each
/// test module in `benches/instantiation/`.
fn bench_instantiation(c: &mut Criterion) {
    build_wasi_example();
    let modules = &[
        "empty.wat",
        "small_memory.wat",
        "data_segments.wat",
        "wasi.wasm",
    ];
    for module in modules {
        // Benchmark inputs are checked in under `benches/instantiation/`.
        let path = Path::new("benches").join("instantiation").join(module);
        bench_sequential(c, &path);
        bench_parallel(c, &path);
    }
}

/// Returns the instance allocation strategies to benchmark.
///
/// The pooling allocator's module limits are raised above their defaults so
/// that large modules (e.g. `rustpython.wasm`) fit within the pool.
fn strategies() -> impl Iterator<Item = InstanceAllocationStrategy> {
    // NOTE: `std::array::IntoIter::new` is the pre-2021-edition way to get a
    // by-value array iterator; on edition 2021+ `[...].into_iter()` works too.
    std::array::IntoIter::new([
        // Skip the on-demand allocator when uffd is enabled
        #[cfg(any(not(feature = "uffd"), not(target_os = "linux")))]
        InstanceAllocationStrategy::OnDemand,
        InstanceAllocationStrategy::Pooling {
            strategy: Default::default(),
            module_limits: ModuleLimits {
                functions: 20_000,
                memory_pages: 1_000,
                ..ModuleLimits::default()
            },
            instance_limits: InstanceLimits::default(),
        },
    ])
}

criterion_group!(benches, bench_instantiation);
Expand Down

0 comments on commit da6d0d8

Please sign in to comment.