Add a benchmark for measuring call overhead (#3883)

The goal of this new benchmark, `call`, is to help us measure the
overhead of both calling into WebAssembly from the host as well as
calling the host from WebAssembly. There are many ways to measure this,
so this benchmark is a bit large, but it should hopefully be fairly
thorough. It's expected that this benchmark will rarely be run in its
entirety; rather, only a subset of the benchmarks will be run at any
given time.
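
A subset can be selected with Criterion's name filter, for example (the
group and function names come from `benches/call.rs` below):

    cargo bench --bench call -- 'host-to-wasm - typed'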

Some metrics measured here are:

* Typed vs Untyped vs Unchecked - testing the cost of calling wasm from
  the host with each of these methods, as well as having wasm call the
  host where the host function is defined with each of these methods
  (a short sketch follows this list).

* With and without `call_hook` - helps to measure the overhead of the
  `Store::call_hook` API.

* Synchronous and Asynchronous - measures the overhead of calling into
  WebAssembly asynchronously (with and without the pooling allocator) in
  addition to defining host APIs in various ways when wasm is called
  asynchronously.
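
As a rough sketch of the first two of these dimensions (condensed from
the full benchmark below; error handling elided with `unwrap`), the
"typed" and "untyped" ways of calling wasm from the host, plus
installing a call hook, look roughly like this:

    use wasmtime::*;

    fn main() {
        let engine = Engine::default();
        let mut store = Store::new(&engine, ());
        let module = Module::new(&engine, r#"(module (func (export "nop")))"#).unwrap();
        let instance = Instance::new(&mut store, &module, &[]).unwrap();

        // "typed": the signature is checked once, when the function is looked up.
        let typed = instance
            .get_typed_func::<(), (), _>(&mut store, "nop")
            .unwrap();
        typed.call(&mut store, ()).unwrap();

        // "untyped": parameters and results are passed as `Val`s and checked on
        // every call.
        let untyped = instance.get_func(&mut store, "nop").unwrap();
        untyped.call(&mut store, &[], &mut []).unwrap();

        // `Store::call_hook` runs on every host<->wasm transition once installed.
        store.call_hook(|_, _| Ok(()));
        typed.call(&mut store, ()).unwrap();
    }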

Currently all the numbers are as expected, notably:

* Host calling WebAssembly is ~25ns of overhead
* WebAssembly calling the host is ~3ns of overhead
* "Unchecked" is a bit slower than "typed", and "Untyped" is slower than
  unchecked.
* Asynchronous wasm calling a synchronous host function has ~3ns of
  overhead (nothing more than usual).
* Asynchronous calls are much slower, on the order of 2-3us, due to
  `madvise` (the configurations involved are sketched below).
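
For reference, the asynchronous configurations those numbers come from
boil down to roughly the following illustrative helper (condensed;
`run_await` is the small polling helper defined at the bottom of the
benchmark, and the pooling variant additionally sets
`Config::allocation_strategy(InstanceAllocationStrategy::pooling())`):

    use wasmtime::*;

    fn call_async_nop() {
        let mut config = Config::new();
        config.async_support(true);
        let engine = Engine::new(&config).unwrap();

        let mut store = Store::new(&engine, ());
        let module = Module::new(&engine, r#"(module (func (export "nop")))"#).unwrap();

        // With async support enabled, the `*_async` entry points must be used.
        let instance = run_await(Instance::new_async(&mut store, &module, &[])).unwrap();
        let nop = instance
            .get_typed_func::<(), (), _>(&mut store, "nop")
            .unwrap();
        run_await(nop.call_async(&mut store, ())).unwrap();
    }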

There are lots of other fiddly bits that could be measured here, but
this should hopefully establish a benchmark we can measure against in
the future, in addition to measuring changes such as #3876
Alex Crichton
committed by GitHub
commit 8c9c72caaa
Cargo.toml (+4)

@@ -135,3 +135,7 @@ harness = false
[[bench]]
name = "thread_eager_init"
harness = false

[[bench]]
name = "call"
harness = false

benches/call.rs (new file, +470)

@@ -0,0 +1,470 @@
use criterion::measurement::WallTime;
use criterion::{criterion_group, criterion_main, BenchmarkGroup, Criterion};
use std::fmt::Debug;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};
use std::time::Instant;
use wasmtime::*;

criterion_main!(benches);
criterion_group!(benches, measure_execution_time);

fn measure_execution_time(c: &mut Criterion) {
    host_to_wasm(c);
    wasm_to_host(c);
}

#[derive(Copy, Clone)]
enum IsAsync {
    Yes,
    YesPooling,
    No,
}

impl IsAsync {
    fn desc(&self) -> &str {
        match self {
            IsAsync::Yes => "async",
            IsAsync::YesPooling => "async-pool",
            IsAsync::No => "sync",
        }
    }

    fn use_async(&self) -> bool {
        match self {
            IsAsync::Yes | IsAsync::YesPooling => true,
            IsAsync::No => false,
        }
    }
}

/// Builds the engines measured here: synchronous, asynchronous, and
/// asynchronous with the pooling instance allocator.
fn engines() -> Vec<(Engine, IsAsync)> {
    let mut config = Config::new();
    vec![
        (Engine::new(&config).unwrap(), IsAsync::No),
        (
            Engine::new(config.async_support(true)).unwrap(),
            IsAsync::Yes,
        ),
        (
            Engine::new(config.allocation_strategy(InstanceAllocationStrategy::pooling())).unwrap(),
            IsAsync::YesPooling,
        ),
    ]
}

/// Benchmarks the overhead of calling WebAssembly from the host in various
/// configurations.
fn host_to_wasm(c: &mut Criterion) {
    for (engine, is_async) in engines() {
        let mut store = Store::new(&engine, ());
        let module = Module::new(
            &engine,
            r#"(module
                (func (export "nop"))
                (func (export "nop-params-and-results") (param i32 i64) (result f32)
                    f32.const 0)
            )"#,
        )
        .unwrap();
        let instance = if is_async.use_async() {
            run_await(Instance::new_async(&mut store, &module, &[])).unwrap()
        } else {
            Instance::new(&mut store, &module, &[]).unwrap()
        };

        let bench_calls = |group: &mut BenchmarkGroup<'_, WallTime>, store: &mut Store<()>| {
            // Bench the overhead of a function that has no parameters or results
            bench_host_to_wasm::<(), ()>(group, store, &instance, is_async, "nop", (), ());

            // Bench the overhead of a function that has some parameters and just
            // one result (will use the raw system-v convention on applicable
            // platforms).
            bench_host_to_wasm::<(i32, i64), (f32,)>(
                group,
                store,
                &instance,
                is_async,
                "nop-params-and-results",
                (0, 0),
                (0.0,),
            );
        };

        // Bench once without any call hooks configured
        let name = format!("{}/no-hook", is_async.desc());
        bench_calls(&mut c.benchmark_group(&name), &mut store);

        // Bench again with a "call hook" enabled
        store.call_hook(|_, _| Ok(()));
        let name = format!("{}/hook-sync", is_async.desc());
        bench_calls(&mut c.benchmark_group(&name), &mut store);
    }
}

fn bench_host_to_wasm<Params, Results>(
    c: &mut BenchmarkGroup<'_, WallTime>,
    store: &mut Store<()>,
    instance: &Instance,
    is_async: IsAsync,
    name: &str,
    typed_params: Params,
    typed_results: Results,
) where
    Params: WasmParams + ToVals + Copy,
    Results: WasmResults + ToVals + Copy + PartialEq + Debug,
{
    // Benchmark the "typed" version, which should be faster than the versions
    // below.
    c.bench_function(&format!("host-to-wasm - typed - {}", name), |b| {
        let typed = instance
            .get_typed_func::<Params, Results, _>(&mut *store, name)
            .unwrap();
        b.iter(|| {
            let results = if is_async.use_async() {
                run_await(typed.call_async(&mut *store, typed_params)).unwrap()
            } else {
                typed.call(&mut *store, typed_params).unwrap()
            };
            assert_eq!(results, typed_results);
        })
    });

    // Benchmark the "untyped" version which should be the slowest of the three
    // here, but not unduly slow.
    c.bench_function(&format!("host-to-wasm - untyped - {}", name), |b| {
        let untyped = instance.get_func(&mut *store, name).unwrap();
        let params = typed_params.to_vals();
        let expected_results = typed_results.to_vals();
        let mut results = vec![Val::I32(0); expected_results.len()];
        b.iter(|| {
            if is_async.use_async() {
                run_await(untyped.call_async(&mut *store, &params, &mut results)).unwrap();
            } else {
                untyped.call(&mut *store, &params, &mut results).unwrap();
            }
            for (expected, actual) in expected_results.iter().zip(&results) {
                assert_vals_eq(expected, actual);
            }
        })
    });

    // Currently `call_async_unchecked` isn't implemented, so can't benchmark
    // below
    if is_async.use_async() {
        return;
    }

    // Benchmark the "unchecked" version which should be between the above two,
    // but is unsafe.
    c.bench_function(&format!("host-to-wasm - unchecked - {}", name), |b| {
        let untyped = instance.get_func(&mut *store, name).unwrap();
        let params = typed_params.to_vals();
        let results = typed_results.to_vals();
        let mut space = vec![ValRaw { i32: 0 }; params.len().max(results.len())];
        b.iter(|| unsafe {
            for (i, param) in params.iter().enumerate() {
                space[i] = param.to_raw(&mut *store);
            }
            untyped
                .call_unchecked(&mut *store, space.as_mut_ptr())
                .unwrap();
            for (i, expected) in results.iter().enumerate() {
                assert_vals_eq(
                    expected,
                    &Val::from_raw(&mut *store, space[i], expected.ty()),
                );
            }
        })
    });
}

/// Benchmarks the overhead of calling the host from WebAssembly itself
fn wasm_to_host(c: &mut Criterion) {
    let module = r#"(module
        ;; host imports with a variety of parameters/arguments
        (import "" "nop" (func $nop))
        (import "" "nop-params-and-results"
            (func $nop_params_and_results (param i32 i64) (result f32))
        )

        ;; "runner functions" for each of the above imports. Each runner
        ;; function takes the number of times to call the host function as a
        ;; parameter, since the duration of this entire loop will be measured.
        (func (export "run-nop") (param i64)
            loop
                call $nop

                local.get 0 ;; decrement & break if necessary
                i64.const -1
                i64.add
                local.tee 0
                i64.const 0
                i64.ne
                br_if 0
            end
        )
        (func (export "run-nop-params-and-results") (param i64)
            loop
                i32.const 0 ;; always zero parameters
                i64.const 0
                call $nop_params_and_results
                f32.const 0 ;; assert the correct result
                f32.eq
                i32.eqz
                if
                    unreachable
                end

                local.get 0 ;; decrement & break if necessary
                i64.const -1
                i64.add
                local.tee 0
                i64.const 0
                i64.ne
                br_if 0
            end
        )
    )"#;

    for (engine, is_async) in engines() {
        let mut store = Store::new(&engine, ());
        let module = Module::new(&engine, module).unwrap();
        bench_calls(
            &mut c.benchmark_group(&format!("{}/no-hook", is_async.desc())),
            &mut store,
            &module,
            is_async,
        );
        store.call_hook(|_, _| Ok(()));
        bench_calls(
            &mut c.benchmark_group(&format!("{}/hook-sync", is_async.desc())),
            &mut store,
            &module,
            is_async,
        );
    }

    // Given a `Store`, creates various instances hooked up to different ways
    // of defining host imports to benchmark their overhead.
    fn bench_calls(
        group: &mut BenchmarkGroup<'_, WallTime>,
        store: &mut Store<()>,
        module: &Module,
        is_async: IsAsync,
    ) {
        let engine = store.engine().clone();

        // Host functions defined with `Linker::func_wrap` ("typed").
        let mut typed = Linker::new(&engine);
        typed.func_wrap("", "nop", || {}).unwrap();
        typed
            .func_wrap("", "nop-params-and-results", |x: i32, y: i64| {
                assert_eq!(x, 0);
                assert_eq!(y, 0);
                0.0f32
            })
            .unwrap();
        let instance = if is_async.use_async() {
            run_await(typed.instantiate_async(&mut *store, &module)).unwrap()
        } else {
            typed.instantiate(&mut *store, &module).unwrap()
        };
        bench_instance(group, store, &instance, "typed", is_async);

        // Host functions defined with `Linker::func_new` ("untyped").
        let mut untyped = Linker::new(&engine);
        untyped
            .func_new("", "nop", FuncType::new([], []), |_, _, _| Ok(()))
            .unwrap();
        let ty = FuncType::new([ValType::I32, ValType::I64], [ValType::F32]);
        untyped
            .func_new(
                "",
                "nop-params-and-results",
                ty,
                |_caller, params, results| {
                    assert_eq!(params.len(), 2);
                    match params[0] {
                        Val::I32(0) => {}
                        _ => unreachable!(),
                    }
                    match params[1] {
                        Val::I64(0) => {}
                        _ => unreachable!(),
                    }
                    assert_eq!(results.len(), 1);
                    results[0] = Val::F32(0);
                    Ok(())
                },
            )
            .unwrap();
        let instance = if is_async.use_async() {
            run_await(untyped.instantiate_async(&mut *store, &module)).unwrap()
        } else {
            untyped.instantiate(&mut *store, &module).unwrap()
        };
        bench_instance(group, store, &instance, "untyped", is_async);

        // Host functions defined with `Linker::func_new_unchecked` ("unchecked").
        unsafe {
            let mut unchecked = Linker::new(&engine);
            unchecked
                .func_new_unchecked("", "nop", FuncType::new([], []), |_, _| Ok(()))
                .unwrap();
            let ty = FuncType::new([ValType::I32, ValType::I64], [ValType::F32]);
            unchecked
                .func_new_unchecked("", "nop-params-and-results", ty, |mut caller, space| {
                    match Val::from_raw(&mut caller, *space, ValType::I32) {
                        Val::I32(0) => {}
                        _ => unreachable!(),
                    }
                    match Val::from_raw(&mut caller, *space.add(1), ValType::I64) {
                        Val::I64(0) => {}
                        _ => unreachable!(),
                    }
                    *space = Val::F32(0).to_raw(&mut caller);
                    Ok(())
                })
                .unwrap();
            let instance = if is_async.use_async() {
                run_await(unchecked.instantiate_async(&mut *store, &module)).unwrap()
            } else {
                unchecked.instantiate(&mut *store, &module).unwrap()
            };
            bench_instance(group, store, &instance, "unchecked", is_async);
        }

        // Only define async host imports if allowed
        if !is_async.use_async() {
            return;
        }

        // Host functions defined with `Linker::func_wrapN_async` ("async-typed").
        let mut typed = Linker::new(&engine);
        typed
            .func_wrap0_async("", "nop", |caller| {
                Box::new(async {
                    drop(caller);
                })
            })
            .unwrap();
        typed
            .func_wrap2_async("", "nop-params-and-results", |_caller, x: i32, y: i64| {
                Box::new(async move {
                    assert_eq!(x, 0);
                    assert_eq!(y, 0);
                    0.0f32
                })
            })
            .unwrap();
        let instance = run_await(typed.instantiate_async(&mut *store, &module)).unwrap();
        bench_instance(group, store, &instance, "async-typed", is_async);
    }

    // Given a specific instance, executes all of the "runner functions"
    fn bench_instance(
        group: &mut BenchmarkGroup<'_, WallTime>,
        store: &mut Store<()>,
        instance: &Instance,
        desc: &str,
        is_async: IsAsync,
    ) {
        group.bench_function(&format!("wasm-to-host - nop - {}", desc), |b| {
            let run = instance
                .get_typed_func::<u64, (), _>(&mut *store, "run-nop")
                .unwrap();
            b.iter_custom(|iters| {
                let start = Instant::now();
                if is_async.use_async() {
                    run_await(run.call_async(&mut *store, iters)).unwrap();
                } else {
                    run.call(&mut *store, iters).unwrap();
                }
                start.elapsed()
            })
        });
        group.bench_function(
            &format!("wasm-to-host - nop-params-and-results - {}", desc),
            |b| {
                let run = instance
                    .get_typed_func::<u64, (), _>(&mut *store, "run-nop-params-and-results")
                    .unwrap();
                b.iter_custom(|iters| {
                    let start = Instant::now();
                    if is_async.use_async() {
                        run_await(run.call_async(&mut *store, iters)).unwrap();
                    } else {
                        run.call(&mut *store, iters).unwrap();
                    }
                    start.elapsed()
                })
            },
        );
    }
}

fn assert_vals_eq(a: &Val, b: &Val) {
    match (a, b) {
        (Val::I32(a), Val::I32(b)) => assert_eq!(a, b),
        (Val::I64(a), Val::I64(b)) => assert_eq!(a, b),
        (Val::F32(a), Val::F32(b)) => assert_eq!(a, b),
        (Val::F64(a), Val::F64(b)) => assert_eq!(a, b),
        _ => unimplemented!(),
    }
}

/// Helper trait to convert the typed parameter/result tuples used by the
/// "typed" benchmarks into `Vec<Val>` for the "untyped" and "unchecked" ones.
trait ToVals {
    fn to_vals(&self) -> Vec<Val>;
}

macro_rules! tuples {
    ($($t:ident)*) => (
        #[allow(non_snake_case)]
        impl<$($t:Copy + Into<Val>,)*> ToVals for ($($t,)*) {
            fn to_vals(&self) -> Vec<Val> {
                let mut _dst = Vec::new();
                let ($($t,)*) = *self;
                $(_dst.push($t.into());)*
                _dst
            }
        }
    )
}

tuples!();
tuples!(A);
tuples!(A B);
tuples!(A B C);

/// Drives `future` to completion by repeatedly polling it on the current
/// thread with a no-op waker.
fn run_await<F: Future>(future: F) -> F::Output {
    let mut f = Pin::from(Box::new(future));
    let waker = dummy_waker();
    let mut cx = Context::from_waker(&waker);
    loop {
        match f.as_mut().poll(&mut cx) {
            Poll::Ready(val) => break val,
            Poll::Pending => {}
        }
    }
}

/// Creates a `Waker` whose methods do nothing, which is all `run_await`
/// needs since it simply polls in a loop.
fn dummy_waker() -> Waker {
    return unsafe { Waker::from_raw(clone(5 as *const _)) };

    unsafe fn clone(ptr: *const ()) -> RawWaker {
        assert_eq!(ptr as usize, 5);
        const VTABLE: RawWakerVTable = RawWakerVTable::new(clone, wake, wake_by_ref, drop);
        RawWaker::new(ptr, &VTABLE)
    }

    unsafe fn wake(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }

    unsafe fn wake_by_ref(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }

    unsafe fn drop(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }
}