mirror of https://github.com/hnes/libaco
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
492 lines
17 KiB
492 lines
17 KiB
// Copyright 2018 Sen Han <00hnes@gmail.com>
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#define _GNU_SOURCE
|
|
|
|
#include "aco.h"
|
|
#include <stdio.h>
|
|
#include <stdint.h>
|
|
|
|
// this header including should be at the last of the `include` directives list
|
|
#include "aco_assert_override.h"
|
|
|
|
void aco_runtime_test(void){
|
|
#ifdef __i386__
|
|
_Static_assert(sizeof(void*) == 4, "require 'sizeof(void*) == 4'");
|
|
#elif __x86_64__
|
|
_Static_assert(sizeof(void*) == 8, "require 'sizeof(void*) == 8'");
|
|
_Static_assert(sizeof(__uint128_t) == 16, "require 'sizeof(__uint128_t) == 16'");
|
|
#else
|
|
#error "platform no support yet"
|
|
#endif
|
|
_Static_assert(sizeof(int) >= 4, "require 'sizeof(int) >= 4'");
|
|
assert(sizeof(int) >= 4);
|
|
_Static_assert(sizeof(int) <= sizeof(size_t),
|
|
"require 'sizeof(int) <= sizeof(size_t)'");
|
|
assert(sizeof(int) <= sizeof(size_t));
|
|
}
|
|
|
|
// assertptr(dst); assertptr(src);
|
|
// assert((((uintptr_t)(src) & 0x0f) == 0) && (((uintptr_t)(dst) & 0x0f) == 0));
|
|
// assert((((sz) & 0x0f) == 0x08) && (((sz) >> 4) >= 0) && (((sz) >> 4) <= 8));
|
|
// sz = 16*n + 8 ( 0 <= n <= 8)
|
|
|
|
// Note: dst and src must be valid address already
|
|
#define aco_amd64_inline_short_aligned_memcpy_test_ok(dst, src, sz) \
|
|
( \
|
|
(((uintptr_t)(src) & 0x0f) == 0) && (((uintptr_t)(dst) & 0x0f) == 0) \
|
|
&& \
|
|
(((sz) & 0x0f) == 0x08) && (((sz) >> 4) >= 0) && (((sz) >> 4) <= 8) \
|
|
)
|
|
|
|
#define aco_amd64_inline_short_aligned_memcpy(dst, src, sz) do {\
|
|
__uint128_t xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7; \
|
|
switch((sz) >> 4){ \
|
|
case 0: \
|
|
break; \
|
|
case 1: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
break; \
|
|
case 2: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
xmm1 = *((__uint128_t*)(src) + 1); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
*((__uint128_t*)(dst) + 1) = xmm1; \
|
|
break; \
|
|
case 3: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
xmm1 = *((__uint128_t*)(src) + 1); \
|
|
xmm2 = *((__uint128_t*)(src) + 2); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
*((__uint128_t*)(dst) + 1) = xmm1; \
|
|
*((__uint128_t*)(dst) + 2) = xmm2; \
|
|
break; \
|
|
case 4: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
xmm1 = *((__uint128_t*)(src) + 1); \
|
|
xmm2 = *((__uint128_t*)(src) + 2); \
|
|
xmm3 = *((__uint128_t*)(src) + 3); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
*((__uint128_t*)(dst) + 1) = xmm1; \
|
|
*((__uint128_t*)(dst) + 2) = xmm2; \
|
|
*((__uint128_t*)(dst) + 3) = xmm3; \
|
|
break; \
|
|
case 5: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
xmm1 = *((__uint128_t*)(src) + 1); \
|
|
xmm2 = *((__uint128_t*)(src) + 2); \
|
|
xmm3 = *((__uint128_t*)(src) + 3); \
|
|
xmm4 = *((__uint128_t*)(src) + 4); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
*((__uint128_t*)(dst) + 1) = xmm1; \
|
|
*((__uint128_t*)(dst) + 2) = xmm2; \
|
|
*((__uint128_t*)(dst) + 3) = xmm3; \
|
|
*((__uint128_t*)(dst) + 4) = xmm4; \
|
|
break; \
|
|
case 6: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
xmm1 = *((__uint128_t*)(src) + 1); \
|
|
xmm2 = *((__uint128_t*)(src) + 2); \
|
|
xmm3 = *((__uint128_t*)(src) + 3); \
|
|
xmm4 = *((__uint128_t*)(src) + 4); \
|
|
xmm5 = *((__uint128_t*)(src) + 5); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
*((__uint128_t*)(dst) + 1) = xmm1; \
|
|
*((__uint128_t*)(dst) + 2) = xmm2; \
|
|
*((__uint128_t*)(dst) + 3) = xmm3; \
|
|
*((__uint128_t*)(dst) + 4) = xmm4; \
|
|
*((__uint128_t*)(dst) + 5) = xmm5; \
|
|
break; \
|
|
case 7: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
xmm1 = *((__uint128_t*)(src) + 1); \
|
|
xmm2 = *((__uint128_t*)(src) + 2); \
|
|
xmm3 = *((__uint128_t*)(src) + 3); \
|
|
xmm4 = *((__uint128_t*)(src) + 4); \
|
|
xmm5 = *((__uint128_t*)(src) + 5); \
|
|
xmm6 = *((__uint128_t*)(src) + 6); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
*((__uint128_t*)(dst) + 1) = xmm1; \
|
|
*((__uint128_t*)(dst) + 2) = xmm2; \
|
|
*((__uint128_t*)(dst) + 3) = xmm3; \
|
|
*((__uint128_t*)(dst) + 4) = xmm4; \
|
|
*((__uint128_t*)(dst) + 5) = xmm5; \
|
|
*((__uint128_t*)(dst) + 6) = xmm6; \
|
|
break; \
|
|
case 8: \
|
|
xmm0 = *((__uint128_t*)(src) + 0); \
|
|
xmm1 = *((__uint128_t*)(src) + 1); \
|
|
xmm2 = *((__uint128_t*)(src) + 2); \
|
|
xmm3 = *((__uint128_t*)(src) + 3); \
|
|
xmm4 = *((__uint128_t*)(src) + 4); \
|
|
xmm5 = *((__uint128_t*)(src) + 5); \
|
|
xmm6 = *((__uint128_t*)(src) + 6); \
|
|
xmm7 = *((__uint128_t*)(src) + 7); \
|
|
*((__uint128_t*)(dst) + 0) = xmm0; \
|
|
*((__uint128_t*)(dst) + 1) = xmm1; \
|
|
*((__uint128_t*)(dst) + 2) = xmm2; \
|
|
*((__uint128_t*)(dst) + 3) = xmm3; \
|
|
*((__uint128_t*)(dst) + 4) = xmm4; \
|
|
*((__uint128_t*)(dst) + 5) = xmm5; \
|
|
*((__uint128_t*)(dst) + 6) = xmm6; \
|
|
*((__uint128_t*)(dst) + 7) = xmm7; \
|
|
break; \
|
|
}\
|
|
*((uint64_t*)((uintptr_t)(dst) + (sz) - 8)) = *((uint64_t*)((uintptr_t)(src) + (sz) - 8)); \
|
|
} while(0)
|
|
|
|
// Note: dst and src must be valid address already
|
|
#define aco_amd64_optimized_memcpy_drop_in(dst, src, sz) do {\
|
|
if(aco_amd64_inline_short_aligned_memcpy_test_ok((dst), (src), (sz))){ \
|
|
aco_amd64_inline_short_aligned_memcpy((dst), (src), (sz)); \
|
|
}else{ \
|
|
memcpy((dst), (src), (sz)); \
|
|
} \
|
|
} while(0)
|
|
|
|
static void aco_default_protector_last_word(void){
|
|
aco_t* co = aco_get_co();
|
|
// do some log about the offending `co`
|
|
fprintf(stderr,"error: aco_default_protector_last_word triggered\n");
|
|
fprintf(stderr, "error: co:%p should call `aco_exit()` instead of direct "
|
|
"`return` in co_fp:%p to finish its execution\n", co, (void*)co->fp);
|
|
assert(0);
|
|
}
|
|
|
|
// aco's Global Thread Local Storage variable `co`
|
|
__thread aco_t* aco_gtls_co;
|
|
static __thread aco_cofuncp_t aco_gtls_last_word_fp = aco_default_protector_last_word;
|
|
|
|
#ifdef __i386__
|
|
static __thread void* aco_gtls_fpucw_mxcsr[2];
|
|
#elif __x86_64__
|
|
static __thread void* aco_gtls_fpucw_mxcsr[1];
|
|
#else
|
|
#error "platform no support yet"
|
|
#endif
|
|
|
|
void aco_thread_init(aco_cofuncp_t last_word_co_fp){
|
|
aco_save_fpucw_mxcsr(aco_gtls_fpucw_mxcsr);
|
|
|
|
if((void*)last_word_co_fp != NULL)
|
|
aco_gtls_last_word_fp = last_word_co_fp;
|
|
}
|
|
|
|
// This function `aco_funcp_protector` should never be
|
|
// called. If it's been called, that means the offending
|
|
// `co` didn't call aco_exit(co) instead of `return` to
|
|
// finish its execution.
|
|
void aco_funcp_protector(void){
|
|
if((void*)(aco_gtls_last_word_fp) != NULL){
|
|
aco_gtls_last_word_fp();
|
|
}else{
|
|
aco_default_protector_last_word();
|
|
}
|
|
assert(0);
|
|
}
|
|
|
|
aco_share_stack_t* aco_share_stack_new(size_t sz){
|
|
return aco_share_stack_new2(sz, 1);
|
|
}
|
|
|
|
#define aco_size_t_safe_add_assert(a,b) do { \
|
|
assert((a)+(b) >= (a)); \
|
|
}while(0)
|
|
|
|
aco_share_stack_t* aco_share_stack_new2(size_t sz, char guard_page_enabled){
|
|
if(sz == 0){
|
|
sz = 1024 * 1024 * 2;
|
|
}
|
|
if(sz < 4096){
|
|
sz = 4096;
|
|
}
|
|
assert(sz > 0);
|
|
|
|
size_t u_pgsz = 0;
|
|
if(guard_page_enabled != 0){
|
|
// although gcc's Built-in Functions to Perform Arithmetic with
|
|
// Overflow Checking is better, but it would require gcc >= 5.0
|
|
long pgsz = sysconf(_SC_PAGESIZE);
|
|
// pgsz must be > 0 && a power of two
|
|
assert(pgsz > 0 && (((pgsz - 1) & pgsz) == 0));
|
|
u_pgsz = (size_t)((unsigned long)pgsz);
|
|
// it should be always true in real life
|
|
assert(u_pgsz == (unsigned long)pgsz && ((u_pgsz << 1) >> 1) == u_pgsz);
|
|
if(sz <= u_pgsz){
|
|
sz = u_pgsz << 1;
|
|
} else {
|
|
size_t new_sz;
|
|
if((sz & (u_pgsz - 1)) != 0){
|
|
new_sz = (sz & (~(u_pgsz - 1)));
|
|
assert(new_sz >= u_pgsz);
|
|
aco_size_t_safe_add_assert(new_sz, (u_pgsz << 1));
|
|
new_sz = new_sz + (u_pgsz << 1);
|
|
assert(sz / u_pgsz + 2 == new_sz / u_pgsz);
|
|
} else {
|
|
aco_size_t_safe_add_assert(sz, u_pgsz);
|
|
new_sz = sz + u_pgsz;
|
|
assert(sz / u_pgsz + 1 == new_sz / u_pgsz);
|
|
}
|
|
sz = new_sz;
|
|
assert((sz / u_pgsz > 1) && ((sz & (u_pgsz - 1)) == 0));
|
|
}
|
|
}
|
|
|
|
aco_share_stack_t* p = malloc(sizeof(aco_share_stack_t));
|
|
assertalloc_ptr(p);
|
|
memset(p, 0, sizeof(aco_share_stack_t));
|
|
|
|
if(guard_page_enabled != 0){
|
|
p->real_ptr = mmap(
|
|
NULL, sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0
|
|
);
|
|
assertalloc_bool(p->real_ptr != MAP_FAILED);
|
|
p->guard_page_enabled = 1;
|
|
assert(0 == mprotect(p->real_ptr, u_pgsz, PROT_READ));
|
|
|
|
p->ptr = (void*)(((uintptr_t)p->real_ptr) + u_pgsz);
|
|
p->real_sz = sz;
|
|
assert(sz >= (u_pgsz << 1));
|
|
p->sz = sz - u_pgsz;
|
|
} else {
|
|
//p->guard_page_enabled = 0;
|
|
p->sz = sz;
|
|
p->ptr = malloc(sz);
|
|
assertalloc_ptr(p->ptr);
|
|
}
|
|
|
|
p->owner = NULL;
|
|
#ifdef ACO_USE_VALGRIND
|
|
p->valgrind_stk_id = VALGRIND_STACK_REGISTER(
|
|
p->ptr, (void*)((uintptr_t)p->ptr + p->sz)
|
|
);
|
|
#endif
|
|
#if defined(__i386__) || defined(__x86_64__)
|
|
uintptr_t u_p = (uintptr_t)(p->sz - (sizeof(void*) << 1) + (uintptr_t)p->ptr);
|
|
u_p = (u_p >> 4) << 4;
|
|
p->align_highptr = (void*)u_p;
|
|
p->align_retptr = (void*)(u_p - sizeof(void*));
|
|
*((void**)(p->align_retptr)) = (void*)(aco_funcp_protector_asm);
|
|
assert(p->sz > (16 + (sizeof(void*) << 1) + sizeof(void*)));
|
|
p->align_limit = p->sz - 16 - (sizeof(void*) << 1);
|
|
#else
|
|
#error "platform no support yet"
|
|
#endif
|
|
return p;
|
|
}
|
|
|
|
void aco_share_stack_destroy(aco_share_stack_t* sstk){
|
|
assert(sstk != NULL && sstk->ptr != NULL);
|
|
#ifdef ACO_USE_VALGRIND
|
|
VALGRIND_STACK_DEREGISTER(sstk->valgrind_stk_id);
|
|
#endif
|
|
if(sstk->guard_page_enabled){
|
|
assert(0 == munmap(sstk->real_ptr, sstk->real_sz));
|
|
sstk->real_ptr = NULL;
|
|
sstk->ptr = NULL;
|
|
} else {
|
|
free(sstk->ptr);
|
|
sstk->ptr = NULL;
|
|
}
|
|
free(sstk);
|
|
}
|
|
|
|
aco_t* aco_create(
|
|
aco_t* main_co, aco_share_stack_t* share_stack,
|
|
size_t save_stack_sz, aco_cofuncp_t fp, void* arg
|
|
){
|
|
|
|
aco_t* p = malloc(sizeof(aco_t));
|
|
assertalloc_ptr(p);
|
|
memset(p, 0, sizeof(aco_t));
|
|
|
|
if(main_co != NULL){ // non-main co
|
|
assertptr(share_stack);
|
|
p->share_stack = share_stack;
|
|
#ifdef __i386__
|
|
// POSIX.1-2008 (IEEE Std 1003.1-2008) - General Information - Data Types - Pointer Types
|
|
// http://pubs.opengroup.org/onlinepubs/9699919799.2008edition/functions/V2_chap02.html#tag_15_12_03
|
|
p->reg[ACO_REG_IDX_RETADDR] = (void*)fp;
|
|
// push retaddr
|
|
p->reg[ACO_REG_IDX_SP] = p->share_stack->align_retptr;
|
|
#ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
|
|
p->reg[ACO_REG_IDX_FPU] = aco_gtls_fpucw_mxcsr[0];
|
|
p->reg[ACO_REG_IDX_FPU + 1] = aco_gtls_fpucw_mxcsr[1];
|
|
#endif
|
|
#elif __x86_64__
|
|
p->reg[ACO_REG_IDX_RETADDR] = (void*)fp;
|
|
p->reg[ACO_REG_IDX_SP] = p->share_stack->align_retptr;
|
|
#ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
|
|
p->reg[ACO_REG_IDX_FPU] = aco_gtls_fpucw_mxcsr[0];
|
|
#endif
|
|
#else
|
|
#error "platform no support yet"
|
|
#endif
|
|
p->main_co = main_co;
|
|
p->arg = arg;
|
|
p->fp = fp;
|
|
if(save_stack_sz == 0){
|
|
save_stack_sz = 64;
|
|
}
|
|
p->save_stack.ptr = malloc(save_stack_sz);
|
|
assertalloc_ptr(p->save_stack.ptr);
|
|
p->save_stack.sz = save_stack_sz;
|
|
#if defined(__i386__) || defined(__x86_64__)
|
|
p->save_stack.valid_sz = 0;
|
|
#else
|
|
#error "platform no support yet"
|
|
#endif
|
|
return p;
|
|
} else { // main co
|
|
p->main_co = NULL;
|
|
p->arg = arg;
|
|
p->fp = fp;
|
|
p->share_stack = NULL;
|
|
p->save_stack.ptr = NULL;
|
|
return p;
|
|
}
|
|
assert(0);
|
|
}
|
|
|
|
aco_attr_no_asan
|
|
void aco_resume(aco_t* resume_co){
|
|
assert(resume_co != NULL && resume_co->main_co != NULL
|
|
&& resume_co->is_end == 0
|
|
);
|
|
if(resume_co->share_stack->owner != resume_co){
|
|
if(resume_co->share_stack->owner != NULL){
|
|
aco_t* owner_co = resume_co->share_stack->owner;
|
|
assert(owner_co->share_stack == resume_co->share_stack);
|
|
#if defined(__i386__) || defined(__x86_64__)
|
|
assert(
|
|
(
|
|
(uintptr_t)(owner_co->share_stack->align_retptr)
|
|
>=
|
|
(uintptr_t)(owner_co->reg[ACO_REG_IDX_SP])
|
|
)
|
|
&&
|
|
(
|
|
(uintptr_t)(owner_co->share_stack->align_highptr)
|
|
-
|
|
(uintptr_t)(owner_co->share_stack->align_limit)
|
|
<=
|
|
(uintptr_t)(owner_co->reg[ACO_REG_IDX_SP])
|
|
)
|
|
);
|
|
owner_co->save_stack.valid_sz =
|
|
(uintptr_t)(owner_co->share_stack->align_retptr)
|
|
-
|
|
(uintptr_t)(owner_co->reg[ACO_REG_IDX_SP]);
|
|
if(owner_co->save_stack.sz < owner_co->save_stack.valid_sz){
|
|
free(owner_co->save_stack.ptr);
|
|
owner_co->save_stack.ptr = NULL;
|
|
while(1){
|
|
owner_co->save_stack.sz = owner_co->save_stack.sz << 1;
|
|
assert(owner_co->save_stack.sz > 0);
|
|
if(owner_co->save_stack.sz >= owner_co->save_stack.valid_sz){
|
|
break;
|
|
}
|
|
}
|
|
owner_co->save_stack.ptr = malloc(owner_co->save_stack.sz);
|
|
assertalloc_ptr(owner_co->save_stack.ptr);
|
|
}
|
|
// TODO: optimize the performance penalty of memcpy function call
|
|
// for very short memory span
|
|
if(owner_co->save_stack.valid_sz > 0) {
|
|
#ifdef __x86_64__
|
|
aco_amd64_optimized_memcpy_drop_in(
|
|
owner_co->save_stack.ptr,
|
|
owner_co->reg[ACO_REG_IDX_SP],
|
|
owner_co->save_stack.valid_sz
|
|
);
|
|
#else
|
|
memcpy(
|
|
owner_co->save_stack.ptr,
|
|
owner_co->reg[ACO_REG_IDX_SP],
|
|
owner_co->save_stack.valid_sz
|
|
);
|
|
#endif
|
|
owner_co->save_stack.ct_save++;
|
|
}
|
|
if(owner_co->save_stack.valid_sz > owner_co->save_stack.max_cpsz){
|
|
owner_co->save_stack.max_cpsz = owner_co->save_stack.valid_sz;
|
|
}
|
|
owner_co->share_stack->owner = NULL;
|
|
owner_co->share_stack->align_validsz = 0;
|
|
#else
|
|
#error "platform no support yet"
|
|
#endif
|
|
}
|
|
assert(resume_co->share_stack->owner == NULL);
|
|
#if defined(__i386__) || defined(__x86_64__)
|
|
assert(
|
|
resume_co->save_stack.valid_sz
|
|
<=
|
|
resume_co->share_stack->align_limit - sizeof(void*)
|
|
);
|
|
// TODO: optimize the performance penalty of memcpy function call
|
|
// for very short memory span
|
|
if(resume_co->save_stack.valid_sz > 0) {
|
|
#ifdef __x86_64__
|
|
aco_amd64_optimized_memcpy_drop_in(
|
|
(void*)(
|
|
(uintptr_t)(resume_co->share_stack->align_retptr)
|
|
-
|
|
resume_co->save_stack.valid_sz
|
|
),
|
|
resume_co->save_stack.ptr,
|
|
resume_co->save_stack.valid_sz
|
|
);
|
|
#else
|
|
memcpy(
|
|
(void*)(
|
|
(uintptr_t)(resume_co->share_stack->align_retptr)
|
|
-
|
|
resume_co->save_stack.valid_sz
|
|
),
|
|
resume_co->save_stack.ptr,
|
|
resume_co->save_stack.valid_sz
|
|
);
|
|
#endif
|
|
resume_co->save_stack.ct_restore++;
|
|
}
|
|
if(resume_co->save_stack.valid_sz > resume_co->save_stack.max_cpsz){
|
|
resume_co->save_stack.max_cpsz = resume_co->save_stack.valid_sz;
|
|
}
|
|
resume_co->share_stack->align_validsz = resume_co->save_stack.valid_sz + sizeof(void*);
|
|
resume_co->share_stack->owner = resume_co;
|
|
#else
|
|
#error "platform no support yet"
|
|
#endif
|
|
}
|
|
aco_gtls_co = resume_co;
|
|
acosw(resume_co->main_co, resume_co);
|
|
aco_gtls_co = resume_co->main_co;
|
|
}
|
|
|
|
void aco_destroy(aco_t* co){
|
|
assertptr(co);
|
|
if(aco_is_main_co(co)){
|
|
free(co);
|
|
} else {
|
|
if(co->share_stack->owner == co){
|
|
co->share_stack->owner = NULL;
|
|
co->share_stack->align_validsz = 0;
|
|
}
|
|
free(co->save_stack.ptr);
|
|
co->save_stack.ptr = NULL;
|
|
free(co);
|
|
}
|
|
}
|
|
|