use anyhow::Result;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use once_cell::unsync::Lazy;
use std::path::Path;
use std::process::Command;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst};
use std::sync::Arc;
use std::thread;
use wasmtime::*;
use wasmtime_wasi::{sync::WasiCtxBuilder, WasiCtx};

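// Build a fresh `Store` with an empty WASI context; each instantiation in
// these benchmarks gets its own store.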
fn store(engine: &Engine) -> Store<WasiCtx> {
    let wasi = WasiCtxBuilder::new().build();
    Store::new(engine, wasi)
}

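// The unit of work measured by every benchmark below: create a store and
// instantiate the pre-validated module in it.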
fn instantiate(pre: &InstancePre<WasiCtx>, engine: &Engine) -> Result<()> {
    let mut store = store(engine);
    let _instance = pre.instantiate(&mut store)?;
    Ok(())
}

fn benchmark_name(strategy: &InstanceAllocationStrategy) -> &'static str {
    match strategy {
        InstanceAllocationStrategy::OnDemand => "default",
        InstanceAllocationStrategy::Pooling { .. } => "pooling",
    }
}

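// Measures back-to-back instantiations on a single thread, once per
// allocation strategy.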
fn bench_sequential(c: &mut Criterion, path: &Path) {
    let mut group = c.benchmark_group("sequential");

    for strategy in strategies() {
        let id = BenchmarkId::new(
            benchmark_name(&strategy),
            path.file_name().unwrap().to_str().unwrap(),
        );
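        // Defer engine creation and module compilation until this benchmark
        // is actually selected to run.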
        let state = Lazy::new(|| {
            let mut config = Config::default();
            config.allocation_strategy(strategy.clone());

            let engine = Engine::new(&config).expect("failed to create engine");
            let module = Module::from_file(&engine, path).unwrap_or_else(|e| {
                panic!("failed to load benchmark `{}`: {:?}", path.display(), e)
            });
            let mut linker = Linker::new(&engine);
            // Add these imports so we can benchmark instantiation of Sightglass
            // benchmark programs.
            linker.func_wrap("bench", "start", || {}).unwrap();
            linker.func_wrap("bench", "end", || {}).unwrap();
            wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
            let pre = linker
                .instantiate_pre(&module)
                .expect("failed to pre-instantiate");
            (engine, pre)
        });

        group.bench_function(id, |b| {
            let (engine, pre) = &*state;
            b.iter(|| {
                instantiate(&pre, &engine).expect("failed to instantiate module");
            });
        });
    }

    group.finish();
}

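// Measures instantiation while up to 15 other threads instantiate the same
// module concurrently, to expose contention in the allocator.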
fn bench_parallel(c: &mut Criterion, path: &Path) {
    let mut group = c.benchmark_group("parallel");

    for strategy in strategies() {
        let state = Lazy::new(|| {
            let mut config = Config::default();
            config.allocation_strategy(strategy.clone());

            let engine = Engine::new(&config).expect("failed to create engine");
            let module =
                Module::from_file(&engine, path).expect("failed to load WASI example module");
            let mut linker = Linker::new(&engine);
            // Add these imports so we can benchmark instantiation of Sightglass
            // benchmark programs.
            linker.func_wrap("bench", "start", || {}).unwrap();
            linker.func_wrap("bench", "end", || {}).unwrap();
            wasmtime_wasi::add_to_linker(&mut linker, |cx| cx).unwrap();
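            // Share the `InstancePre` via an `Arc` so the worker threads
            // spawned below can instantiate it concurrently.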
            let pre = Arc::new(
                linker
                    .instantiate_pre(&module)
                    .expect("failed to pre-instantiate"),
            );
            (engine, pre)
        });

        for threads in 1..=num_cpus::get_physical().min(16) {
            let name = format!(
                "{}: with {} thread{}",
                path.file_name().unwrap().to_str().unwrap(),
                threads,
                if threads == 1 { "" } else { "s" }
            );
            let id = BenchmarkId::new(benchmark_name(&strategy), name);
            group.bench_function(id, |b| {
                let (engine, pre) = &*state;

                // Spin up N-1 threads doing background instantiations to
                // simulate concurrent instantiations.
                let done = Arc::new(AtomicBool::new(false));
                let count = Arc::new(AtomicUsize::new(0));
                let workers = (0..threads - 1)
                    .map(|_| {
                        let pre = pre.clone();
                        let done = done.clone();
                        let engine = engine.clone();
                        let count = count.clone();
                        thread::spawn(move || {
                            count.fetch_add(1, SeqCst);
                            while !done.load(SeqCst) {
                                instantiate(&pre, &engine).unwrap();
                            }
                        })
                    })
                    .collect::<Vec<_>>();

                // Wait for our workers to all get started and have
                // instantiated their first module, at which point they'll
                // all be spinning.
                while count.load(SeqCst) != threads - 1 {
                    thread::yield_now();
                }

                // Now that our background work is configured we can
                // benchmark the amount of time it takes to instantiate this
                // module.
                b.iter(|| {
                    instantiate(&pre, &engine).expect("failed to instantiate module");
                });

                // Shut down this benchmark iteration by signalling to
                // worker threads they should exit and then wait for them to
                // have reached the exit point.
                done.store(true, SeqCst);
                for t in workers {
                    t.join().unwrap();
                }
            });
        }
    }

    group.finish();
}

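// Measures `Module::deserialize` and `Module::deserialize_file`, which load a
// previously serialized module and skip compilation entirely.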
fn bench_deserialize_module(c: &mut Criterion, path: &Path) {
    let mut group = c.benchmark_group("deserialize");

    let name = path.file_name().unwrap().to_str().unwrap();
    let tmpfile = tempfile::NamedTempFile::new().unwrap();
    let state = Lazy::new(|| {
        let engine = Engine::default();
        let module = Module::from_file(&engine, path).expect("failed to load WASI example module");
        let bytes = module.serialize().unwrap();
        std::fs::write(tmpfile.path(), &bytes).unwrap();
        (engine, bytes, tmpfile.path())
    });

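    // `Module::deserialize` is `unsafe` because it trusts that the bytes were
    // produced by a prior `Module::serialize` and have not been modified.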
    group.bench_function(BenchmarkId::new("deserialize", name), |b| {
        let (engine, bytes, _) = &*state;
        b.iter(|| unsafe {
            Module::deserialize(&engine, bytes).unwrap();
        });
    });
    group.bench_function(BenchmarkId::new("deserialize_file", name), |b| {
        let (engine, _, path) = &*state;
        b.iter(|| unsafe {
            Module::deserialize_file(&engine, path).unwrap();
        });
    });

    group.finish();
}

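// Compiles the `example-wasi-wasm` crate for `wasm32-wasi` and copies the
// resulting module into `benches/instantiation` for the benchmarks to pick up.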
fn build_wasi_example() {
    println!("Building WASI example module...");
    if !Command::new("cargo")
        .args(&[
            "build",
            "--release",
            "-p",
            "example-wasi-wasm",
            "--target",
            "wasm32-wasi",
        ])
        .spawn()
        .expect("failed to run cargo to build WASI example")
        .wait()
        .expect("failed to wait for cargo to build")
        .success()
    {
        panic!("failed to build WASI example for target `wasm32-wasi`");
    }

    std::fs::copy(
        "target/wasm32-wasi/release/wasi.wasm",
        "benches/instantiation/wasi.wasm",
    )
    .expect("failed to copy WASI example module");
}

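// Entry point: runs the sequential, parallel, and deserialize benchmarks for
// every module in `benches/instantiation`.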
fn bench_instantiation(c: &mut Criterion) {
    build_wasi_example();

    for file in std::fs::read_dir("benches/instantiation").unwrap() {
        let path = file.unwrap().path();
        bench_sequential(c, &path);
        bench_parallel(c, &path);
        bench_deserialize_module(c, &path);
    }
}

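// The two allocation strategies under test: the default on-demand allocator
// and the pooling allocator, sized to allow up to 10,000 pages per memory.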
fn strategies() -> impl Iterator<Item = InstanceAllocationStrategy> {
    [
        InstanceAllocationStrategy::OnDemand,
        InstanceAllocationStrategy::Pooling({
            let mut config = PoolingAllocationConfig::default();
            config.memory_pages(10_000);
            config
        }),
    ]
    .into_iter()
}

criterion_group!(benches, bench_instantiation);
criterion_main!(benches);