Add a benchmark for measuring call overhead (#3883)

The goal of this new benchmark, `call`, is to help us measure the
overhead of both calling into WebAssembly from the host as well as
calling the host from WebAssembly. There are many ways to measure this,
so this benchmark is a bit large, but it should hopefully be fairly
thorough. It's expected that this benchmark will rarely be run in its
entirety; rather, only a subset of the benchmarks will be run at any
given time.
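
A subset can be selected with Criterion's name filter, for example (the
group and function names come from `benches/call.rs` below):

    cargo bench --bench call -- 'host-to-wasm - typed'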

Some metrics measured here are:

* Typed vs Untyped vs Unchecked - testing the cost of calling wasm from
  the host with each of these methods, as well as having wasm call the
  host where the host function is defined with each of these methods
  (a short sketch follows this list).

* With and without `call_hook` - helps to measure the overhead of the
  `Store::call_hook` API.

* Synchronous and Asynchronous - measures the overhead of calling into
  WebAssembly asynchronously (with and without the pooling allocator) in
  addition to defining host APIs in various ways when wasm is called
  asynchronously.
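
As a rough sketch of the first two of these dimensions (condensed from
the full benchmark below; error handling elided with `unwrap`), the
"typed" and "untyped" ways of calling wasm from the host, plus
installing a call hook, look roughly like this:

    use wasmtime::*;

    fn main() {
        let engine = Engine::default();
        let mut store = Store::new(&engine, ());
        let module = Module::new(&engine, r#"(module (func (export "nop")))"#).unwrap();
        let instance = Instance::new(&mut store, &module, &[]).unwrap();

        // "typed": the signature is checked once, when the function is looked up.
        let typed = instance
            .get_typed_func::<(), (), _>(&mut store, "nop")
            .unwrap();
        typed.call(&mut store, ()).unwrap();

        // "untyped": parameters and results are passed as `Val`s and checked on
        // every call.
        let untyped = instance.get_func(&mut store, "nop").unwrap();
        untyped.call(&mut store, &[], &mut []).unwrap();

        // `Store::call_hook` runs on every host<->wasm transition once installed.
        store.call_hook(|_, _| Ok(()));
        typed.call(&mut store, ()).unwrap();
    }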

Currently all the numbers are as expected, notably:

* Host calling WebAssembly is ~25ns of overhead
* WebAssembly calling the host is ~3ns of overhead
* "Unchecked" is a bit slower than "typed", and "Untyped" is slower than
  unchecked.
* Asynchronous wasm calling a synchronous host function has ~3ns of
  overhead (nothing more than usual).
* Asynchronous calls are much slower, on the order of 2-3us, due to
  `madvise` (the configurations involved are sketched below).
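
For reference, the asynchronous configurations those numbers come from
boil down to roughly the following illustrative helper (condensed;
`run_await` is the small polling helper defined at the bottom of the
benchmark, and the pooling variant additionally sets
`Config::allocation_strategy(InstanceAllocationStrategy::pooling())`):

    use wasmtime::*;

    fn call_async_nop() {
        let mut config = Config::new();
        config.async_support(true);
        let engine = Engine::new(&config).unwrap();

        let mut store = Store::new(&engine, ());
        let module = Module::new(&engine, r#"(module (func (export "nop")))"#).unwrap();

        // With async support enabled, the `*_async` entry points must be used.
        let instance = run_await(Instance::new_async(&mut store, &module, &[])).unwrap();
        let nop = instance
            .get_typed_func::<(), (), _>(&mut store, "nop")
            .unwrap();
        run_await(nop.call_async(&mut store, ())).unwrap();
    }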

There are lots of other fiddly bits that could be measured here, but
this should hopefully establish a benchmark we can measure against in
the future, in addition to measuring changes such as #3876
Alex Crichton
committed by GitHub
commit 8c9c72caaa
Cargo.toml (+4)

@@ -135,3 +135,7 @@ harness = false
[[bench]]
name = "thread_eager_init"
harness = false

[[bench]]
name = "call"
harness = false

benches/call.rs (new file, +470)

@@ -0,0 +1,470 @@
use criterion::measurement::WallTime;
use criterion::{criterion_group, criterion_main, BenchmarkGroup, Criterion};
use std::fmt::Debug;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};
use std::time::Instant;
use wasmtime::*;

criterion_main!(benches);
criterion_group!(benches, measure_execution_time);

fn measure_execution_time(c: &mut Criterion) {
    host_to_wasm(c);
    wasm_to_host(c);
}

#[derive(Copy, Clone)]
enum IsAsync {
    Yes,
    YesPooling,
    No,
}

impl IsAsync {
    fn desc(&self) -> &str {
        match self {
            IsAsync::Yes => "async",
            IsAsync::YesPooling => "async-pool",
            IsAsync::No => "sync",
        }
    }

    fn use_async(&self) -> bool {
        match self {
            IsAsync::Yes | IsAsync::YesPooling => true,
            IsAsync::No => false,
        }
    }
}

/// Builds the engines measured here: synchronous, asynchronous, and
/// asynchronous with the pooling instance allocator.
fn engines() -> Vec<(Engine, IsAsync)> {
    let mut config = Config::new();
    vec![
        (Engine::new(&config).unwrap(), IsAsync::No),
        (
            Engine::new(config.async_support(true)).unwrap(),
            IsAsync::Yes,
        ),
        (
            Engine::new(config.allocation_strategy(InstanceAllocationStrategy::pooling())).unwrap(),
            IsAsync::YesPooling,
        ),
    ]
}

/// Benchmarks the overhead of calling WebAssembly from the host in various
/// configurations.
fn host_to_wasm(c: &mut Criterion) {
    for (engine, is_async) in engines() {
        let mut store = Store::new(&engine, ());
        let module = Module::new(
            &engine,
            r#"(module
                (func (export "nop"))
                (func (export "nop-params-and-results") (param i32 i64) (result f32)
                    f32.const 0)
            )"#,
        )
        .unwrap();
        let instance = if is_async.use_async() {
            run_await(Instance::new_async(&mut store, &module, &[])).unwrap()
        } else {
            Instance::new(&mut store, &module, &[]).unwrap()
        };

        let bench_calls = |group: &mut BenchmarkGroup<'_, WallTime>, store: &mut Store<()>| {
            // Bench the overhead of a function that has no parameters or results
            bench_host_to_wasm::<(), ()>(group, store, &instance, is_async, "nop", (), ());

            // Bench the overhead of a function that has some parameters and just
            // one result (will use the raw system-v convention on applicable
            // platforms).
            bench_host_to_wasm::<(i32, i64), (f32,)>(
                group,
                store,
                &instance,
                is_async,
                "nop-params-and-results",
                (0, 0),
                (0.0,),
            );
        };

        // Bench once without any call hooks configured
        let name = format!("{}/no-hook", is_async.desc());
        bench_calls(&mut c.benchmark_group(&name), &mut store);

        // Bench again with a "call hook" enabled
        store.call_hook(|_, _| Ok(()));
        let name = format!("{}/hook-sync", is_async.desc());
        bench_calls(&mut c.benchmark_group(&name), &mut store);
    }
}

fn bench_host_to_wasm<Params, Results>(
    c: &mut BenchmarkGroup<'_, WallTime>,
    store: &mut Store<()>,
    instance: &Instance,
    is_async: IsAsync,
    name: &str,
    typed_params: Params,
    typed_results: Results,
) where
    Params: WasmParams + ToVals + Copy,
    Results: WasmResults + ToVals + Copy + PartialEq + Debug,
{
    // Benchmark the "typed" version, which should be faster than the versions
    // below.
    c.bench_function(&format!("host-to-wasm - typed - {}", name), |b| {
        let typed = instance
            .get_typed_func::<Params, Results, _>(&mut *store, name)
            .unwrap();
        b.iter(|| {
            let results = if is_async.use_async() {
                run_await(typed.call_async(&mut *store, typed_params)).unwrap()
            } else {
                typed.call(&mut *store, typed_params).unwrap()
            };
            assert_eq!(results, typed_results);
        })
    });

    // Benchmark the "untyped" version which should be the slowest of the three
    // here, but not unduly slow.
    c.bench_function(&format!("host-to-wasm - untyped - {}", name), |b| {
        let untyped = instance.get_func(&mut *store, name).unwrap();
        let params = typed_params.to_vals();
        let expected_results = typed_results.to_vals();
        let mut results = vec![Val::I32(0); expected_results.len()];
        b.iter(|| {
            if is_async.use_async() {
                run_await(untyped.call_async(&mut *store, &params, &mut results)).unwrap();
            } else {
                untyped.call(&mut *store, &params, &mut results).unwrap();
            }
            for (expected, actual) in expected_results.iter().zip(&results) {
                assert_vals_eq(expected, actual);
            }
        })
    });

    // Currently `call_async_unchecked` isn't implemented, so can't benchmark
    // below
    if is_async.use_async() {
        return;
    }

    // Benchmark the "unchecked" version which should be between the above two,
    // but is unsafe.
    c.bench_function(&format!("host-to-wasm - unchecked - {}", name), |b| {
        let untyped = instance.get_func(&mut *store, name).unwrap();
        let params = typed_params.to_vals();
        let results = typed_results.to_vals();
        let mut space = vec![ValRaw { i32: 0 }; params.len().max(results.len())];
        b.iter(|| unsafe {
            for (i, param) in params.iter().enumerate() {
                space[i] = param.to_raw(&mut *store);
            }
            untyped
                .call_unchecked(&mut *store, space.as_mut_ptr())
                .unwrap();
            for (i, expected) in results.iter().enumerate() {
                assert_vals_eq(
                    expected,
                    &Val::from_raw(&mut *store, space[i], expected.ty()),
                );
            }
        })
    });
}

/// Benchmarks the overhead of calling the host from WebAssembly itself
fn wasm_to_host(c: &mut Criterion) {
    let module = r#"(module
        ;; host imports with a variety of parameters/arguments
        (import "" "nop" (func $nop))
        (import "" "nop-params-and-results"
            (func $nop_params_and_results (param i32 i64) (result f32))
        )

        ;; "runner functions" for each of the above imports. Each runner
        ;; function takes the number of times to call the host function as a
        ;; parameter, since the duration of this entire loop will be measured.
        (func (export "run-nop") (param i64)
            loop
                call $nop

                local.get 0 ;; decrement & break if necessary
                i64.const -1
                i64.add
                local.tee 0
                i64.const 0
                i64.ne
                br_if 0
            end
        )
        (func (export "run-nop-params-and-results") (param i64)
            loop
                i32.const 0 ;; always zero parameters
                i64.const 0
                call $nop_params_and_results
                f32.const 0 ;; assert the correct result
                f32.eq
                i32.eqz
                if
                    unreachable
                end

                local.get 0 ;; decrement & break if necessary
                i64.const -1
                i64.add
                local.tee 0
                i64.const 0
                i64.ne
                br_if 0
            end
        )
    )"#;

    for (engine, is_async) in engines() {
        let mut store = Store::new(&engine, ());
        let module = Module::new(&engine, module).unwrap();
        bench_calls(
            &mut c.benchmark_group(&format!("{}/no-hook", is_async.desc())),
            &mut store,
            &module,
            is_async,
        );
        store.call_hook(|_, _| Ok(()));
        bench_calls(
            &mut c.benchmark_group(&format!("{}/hook-sync", is_async.desc())),
            &mut store,
            &module,
            is_async,
        );
    }

    // Given a `Store`, creates various instances hooked up to different ways
    // of defining host imports to benchmark their overhead.
    fn bench_calls(
        group: &mut BenchmarkGroup<'_, WallTime>,
        store: &mut Store<()>,
        module: &Module,
        is_async: IsAsync,
    ) {
        let engine = store.engine().clone();

        // Host functions defined with `Linker::func_wrap` ("typed").
        let mut typed = Linker::new(&engine);
        typed.func_wrap("", "nop", || {}).unwrap();
        typed
            .func_wrap("", "nop-params-and-results", |x: i32, y: i64| {
                assert_eq!(x, 0);
                assert_eq!(y, 0);
                0.0f32
            })
            .unwrap();
        let instance = if is_async.use_async() {
            run_await(typed.instantiate_async(&mut *store, &module)).unwrap()
        } else {
            typed.instantiate(&mut *store, &module).unwrap()
        };
        bench_instance(group, store, &instance, "typed", is_async);

        // Host functions defined with `Linker::func_new` ("untyped").
        let mut untyped = Linker::new(&engine);
        untyped
            .func_new("", "nop", FuncType::new([], []), |_, _, _| Ok(()))
            .unwrap();
        let ty = FuncType::new([ValType::I32, ValType::I64], [ValType::F32]);
        untyped
            .func_new(
                "",
                "nop-params-and-results",
                ty,
                |_caller, params, results| {
                    assert_eq!(params.len(), 2);
                    match params[0] {
                        Val::I32(0) => {}
                        _ => unreachable!(),
                    }
                    match params[1] {
                        Val::I64(0) => {}
                        _ => unreachable!(),
                    }
                    assert_eq!(results.len(), 1);
                    results[0] = Val::F32(0);
                    Ok(())
                },
            )
            .unwrap();
        let instance = if is_async.use_async() {
            run_await(untyped.instantiate_async(&mut *store, &module)).unwrap()
        } else {
            untyped.instantiate(&mut *store, &module).unwrap()
        };
        bench_instance(group, store, &instance, "untyped", is_async);

        // Host functions defined with `Linker::func_new_unchecked` ("unchecked").
        unsafe {
            let mut unchecked = Linker::new(&engine);
            unchecked
                .func_new_unchecked("", "nop", FuncType::new([], []), |_, _| Ok(()))
                .unwrap();
            let ty = FuncType::new([ValType::I32, ValType::I64], [ValType::F32]);
            unchecked
                .func_new_unchecked("", "nop-params-and-results", ty, |mut caller, space| {
                    match Val::from_raw(&mut caller, *space, ValType::I32) {
                        Val::I32(0) => {}
                        _ => unreachable!(),
                    }
                    match Val::from_raw(&mut caller, *space.add(1), ValType::I64) {
                        Val::I64(0) => {}
                        _ => unreachable!(),
                    }
                    *space = Val::F32(0).to_raw(&mut caller);
                    Ok(())
                })
                .unwrap();
            let instance = if is_async.use_async() {
                run_await(unchecked.instantiate_async(&mut *store, &module)).unwrap()
            } else {
                unchecked.instantiate(&mut *store, &module).unwrap()
            };
            bench_instance(group, store, &instance, "unchecked", is_async);
        }

        // Only define async host imports if allowed
        if !is_async.use_async() {
            return;
        }

        // Host functions defined with `Linker::func_wrapN_async` ("async-typed").
        let mut typed = Linker::new(&engine);
        typed
            .func_wrap0_async("", "nop", |caller| {
                Box::new(async {
                    drop(caller);
                })
            })
            .unwrap();
        typed
            .func_wrap2_async("", "nop-params-and-results", |_caller, x: i32, y: i64| {
                Box::new(async move {
                    assert_eq!(x, 0);
                    assert_eq!(y, 0);
                    0.0f32
                })
            })
            .unwrap();
        let instance = run_await(typed.instantiate_async(&mut *store, &module)).unwrap();
        bench_instance(group, store, &instance, "async-typed", is_async);
    }

    // Given a specific instance, executes all of the "runner functions"
    fn bench_instance(
        group: &mut BenchmarkGroup<'_, WallTime>,
        store: &mut Store<()>,
        instance: &Instance,
        desc: &str,
        is_async: IsAsync,
    ) {
        group.bench_function(&format!("wasm-to-host - nop - {}", desc), |b| {
            let run = instance
                .get_typed_func::<u64, (), _>(&mut *store, "run-nop")
                .unwrap();
            b.iter_custom(|iters| {
                let start = Instant::now();
                if is_async.use_async() {
                    run_await(run.call_async(&mut *store, iters)).unwrap();
                } else {
                    run.call(&mut *store, iters).unwrap();
                }
                start.elapsed()
            })
        });
        group.bench_function(
            &format!("wasm-to-host - nop-params-and-results - {}", desc),
            |b| {
                let run = instance
                    .get_typed_func::<u64, (), _>(&mut *store, "run-nop-params-and-results")
                    .unwrap();
                b.iter_custom(|iters| {
                    let start = Instant::now();
                    if is_async.use_async() {
                        run_await(run.call_async(&mut *store, iters)).unwrap();
                    } else {
                        run.call(&mut *store, iters).unwrap();
                    }
                    start.elapsed()
                })
            },
        );
    }
}

fn assert_vals_eq(a: &Val, b: &Val) {
    match (a, b) {
        (Val::I32(a), Val::I32(b)) => assert_eq!(a, b),
        (Val::I64(a), Val::I64(b)) => assert_eq!(a, b),
        (Val::F32(a), Val::F32(b)) => assert_eq!(a, b),
        (Val::F64(a), Val::F64(b)) => assert_eq!(a, b),
        _ => unimplemented!(),
    }
}

/// Helper trait to convert the typed parameter/result tuples used by the
/// "typed" benchmarks into `Vec<Val>` for the "untyped" and "unchecked" ones.
trait ToVals {
    fn to_vals(&self) -> Vec<Val>;
}

macro_rules! tuples {
    ($($t:ident)*) => (
        #[allow(non_snake_case)]
        impl<$($t:Copy + Into<Val>,)*> ToVals for ($($t,)*) {
            fn to_vals(&self) -> Vec<Val> {
                let mut _dst = Vec::new();
                let ($($t,)*) = *self;
                $(_dst.push($t.into());)*
                _dst
            }
        }
    )
}

tuples!();
tuples!(A);
tuples!(A B);
tuples!(A B C);

/// Drives `future` to completion by repeatedly polling it on the current
/// thread with a no-op waker.
fn run_await<F: Future>(future: F) -> F::Output {
    let mut f = Pin::from(Box::new(future));
    let waker = dummy_waker();
    let mut cx = Context::from_waker(&waker);
    loop {
        match f.as_mut().poll(&mut cx) {
            Poll::Ready(val) => break val,
            Poll::Pending => {}
        }
    }
}

/// Creates a `Waker` whose methods do nothing, which is all `run_await`
/// needs since it simply polls in a loop.
fn dummy_waker() -> Waker {
    return unsafe { Waker::from_raw(clone(5 as *const _)) };

    unsafe fn clone(ptr: *const ()) -> RawWaker {
        assert_eq!(ptr as usize, 5);
        const VTABLE: RawWakerVTable = RawWakerVTable::new(clone, wake, wake_by_ref, drop);
        RawWaker::new(ptr, &VTABLE)
    }

    unsafe fn wake(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }

    unsafe fn wake_by_ref(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }

    unsafe fn drop(ptr: *const ()) {
        assert_eq!(ptr as usize, 5);
    }
}