diff --git a/TSRM/TSRM.c b/TSRM/TSRM.c index e99993204b6f..d55a70c02ed2 100644 --- a/TSRM/TSRM.c +++ b/TSRM/TSRM.c @@ -42,6 +42,8 @@ typedef struct { ts_allocate_ctor ctor; ts_allocate_dtor dtor; size_t fast_offset; + /* When set, storage comes from __thread memory instead of being allocated by TSRM. */ + void *(*tls_addr)(void); int done; } tsrm_resource_type; @@ -163,14 +165,19 @@ TSRM_API bool tsrm_startup(int expected_threads, int expected_resources, int deb static void ts_free_resources(tsrm_tls_entry *thread_resources) { + bool own_thread = thread_resources->thread_id == tsrm_thread_id(); + /* Need to destroy in reverse order to respect dependencies. */ for (int i = thread_resources->count - 1; i >= 0; i--) { if (!resource_types_table[i].done) { + if (resource_types_table[i].tls_addr && !own_thread) { + continue; + } if (resource_types_table[i].dtor) { resource_types_table[i].dtor(thread_resources->storage[i]); } - if (!resource_types_table[i].fast_offset) { + if (!resource_types_table[i].fast_offset && !resource_types_table[i].tls_addr) { free(thread_resources->storage[i]); } } @@ -256,7 +263,10 @@ static void tsrm_update_active_threads(void) p->storage = (void *) realloc(p->storage, sizeof(void *)*id_count); for (j=p->count; j<id_count; j++) { - if (resource_types_table[j].fast_offset) { + if (resource_types_table[j].tls_addr) { + TSRM_ASSERT(p->thread_id == tsrm_thread_id()); + p->storage[j] = resource_types_table[j].tls_addr(); + } else if (resource_types_table[j].fast_offset) { p->storage[j] = (void *) (((char*)p) + resource_types_table[j].fast_offset); } else { p->storage[j] = (void *) malloc(resource_types_table[j].size); @@ -301,6 +311,7 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0; + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL; resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; 
tsrm_update_active_threads(); @@ -359,6 +370,7 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = *offset; + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL; resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; tsrm_update_active_threads(); @@ -368,6 +380,41 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz return *rsrc_id; }/*}}}*/ +/* allocates a resource id whose per-thread storage is a native __thread block */ +TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor) +{/*{{{*/ + TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Obtaining a new TLS resource id, %d bytes", size)); + + tsrm_mutex_lock(tsmm_mutex); + + *rsrc_id = TSRM_SHUFFLE_RSRC_ID(id_count++); + + if (resource_types_table_size < id_count) { + tsrm_resource_type *_tmp; + _tmp = (tsrm_resource_type *) realloc(resource_types_table, sizeof(tsrm_resource_type)*id_count); + if (!_tmp) { + TSRM_ERROR((TSRM_ERROR_LEVEL_ERROR, "Unable to allocate storage for resource")); + *rsrc_id = 0; + tsrm_mutex_unlock(tsmm_mutex); + return 0; + } + resource_types_table = _tmp; + resource_types_table_size = id_count; + } + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].size = size; + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0; + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = tls_addr; + resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; + + tsrm_update_active_threads(); + tsrm_mutex_unlock(tsmm_mutex); + + 
TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Successfully allocated new TLS resource id %d", *rsrc_id)); + return *rsrc_id; +}/*}}}*/ + static void set_thread_local_storage_resource_to(tsrm_tls_entry *thread_resource) { tsrm_tls_set(thread_resource); @@ -397,7 +444,9 @@ static void allocate_new_resource(tsrm_tls_entry **thread_resources_ptr, THREAD_ if (resource_types_table[i].done) { (*thread_resources_ptr)->storage[i] = NULL; } else { - if (resource_types_table[i].fast_offset) { + if (resource_types_table[i].tls_addr) { + (*thread_resources_ptr)->storage[i] = resource_types_table[i].tls_addr(); + } else if (resource_types_table[i].fast_offset) { (*thread_resources_ptr)->storage[i] = (void *) (((char*)(*thread_resources_ptr)) + resource_types_table[i].fast_offset); } else { (*thread_resources_ptr)->storage[i] = (void *) malloc(resource_types_table[i].size); @@ -485,7 +534,8 @@ TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id) /* In case that extensions don't use the pointer passed from the dtor, but incorrectly * use the global pointer, we need to setup the global pointer temporarily here. */ set_thread_local_storage_resource_to(thread_resources); - /* Free up the old resource from the old thread instance */ + /* Dead thread, recycled id: already freed, so just zero it. 
*/ + thread_resources->thread_id = 0; ts_free_resources(thread_resources); free(thread_resources); /* Allocate a new resource at the same point in the linked list, and relink the next pointer */ @@ -559,7 +609,7 @@ void ts_free_id(ts_rsrc_id id) if (resource_types_table[rsrc_id].dtor) { resource_types_table[rsrc_id].dtor(p->storage[rsrc_id]); } - if (!resource_types_table[rsrc_id].fast_offset) { + if (!resource_types_table[rsrc_id].fast_offset && !resource_types_table[rsrc_id].tls_addr) { free(p->storage[rsrc_id]); } } diff --git a/TSRM/TSRM.h b/TSRM/TSRM.h index ea13552c8374..237cb8fc1c4c 100644 --- a/TSRM/TSRM.h +++ b/TSRM/TSRM.h @@ -93,6 +93,8 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate /* Fast resource in reserved (pre-allocated) space */ TSRM_API void tsrm_reserve(size_t size); TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor); +/* Must be called at startup before any other thread exists. 
*/ +TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor); /* fetches the requested resource for the current thread */ TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id); @@ -155,7 +157,7 @@ TSRM_API bool tsrm_is_managed_thread(void); #if !__has_attribute(tls_model) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__MUSL__) || defined(__HAIKU__) # define TSRM_TLS_MODEL_ATTR # define TSRM_TLS_MODEL_DEFAULT -#elif __PIC__ +#elif __PIC__ && !defined(__PIE__) # define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("initial-exec"))) # define TSRM_TLS_MODEL_INITIAL_EXEC #else diff --git a/Zend/Zend.m4 b/Zend/Zend.m4 index d0c682e8e659..319ddca1e025 100644 --- a/Zend/Zend.m4 +++ b/Zend/Zend.m4 @@ -178,6 +178,11 @@ AC_MSG_RESULT([$ZEND_ZTS]) AS_VAR_IF([ZEND_ZTS], [yes], [ AC_DEFINE([ZTS], [1], [Define to 1 if thread safety (ZTS) is enabled.]) AS_VAR_APPEND([CFLAGS], [" -DZTS"]) + + dnl -mtls-size=12 drops the dead high-bits offset add from TLS access, + dnl valid while the thread-local block stays under 4 KiB. + AX_CHECK_COMPILE_FLAG([-mtls-size=12], + [AS_VAR_APPEND([CFLAGS], [" -mtls-size=12"])]) ]) AC_MSG_CHECKING([whether to enable Zend debugging]) diff --git a/Zend/zend.c b/Zend/zend.c index f16b1a30dbbc..cb403ea0469d 100644 --- a/Zend/zend.c +++ b/Zend/zend.c @@ -52,8 +52,12 @@ static bool startup_done = false; #ifdef ZTS ZEND_API int compiler_globals_id; ZEND_API int executor_globals_id; -ZEND_API size_t compiler_globals_offset; -ZEND_API size_t executor_globals_offset; +ZEND_TLS_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_executor_globals executor_globals_tls; +ZEND_TLS_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_compiler_globals compiler_globals_tls; +/* ts_allocate_tls_id takes a callback so each thread resolves its own block. + * A plain &..._tls would capture only the registering thread's address. 
*/ +static void *executor_globals_tls_addr(void) { return &executor_globals_tls; } +static void *compiler_globals_tls_addr(void) { return &compiler_globals_tls; } static HashTable *global_function_table = NULL; static HashTable *global_class_table = NULL; static HashTable *global_constants_table = NULL; @@ -1019,8 +1023,8 @@ void zend_startup(zend_utility_functions *utility_functions) /* {{{ */ zend_init_rsrc_list_dtors(); #ifdef ZTS - ts_allocate_fast_id(&compiler_globals_id, &compiler_globals_offset, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor); - ts_allocate_fast_id(&executor_globals_id, &executor_globals_offset, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor); + ts_allocate_tls_id(&compiler_globals_id, compiler_globals_tls_addr, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor); + ts_allocate_tls_id(&executor_globals_id, executor_globals_tls_addr, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor); ts_allocate_fast_id(&language_scanner_globals_id, &language_scanner_globals_offset, sizeof(zend_php_scanner_globals), (ts_allocate_ctor) php_scanner_globals_ctor, NULL); ts_allocate_fast_id(&ini_scanner_globals_id, &ini_scanner_globals_offset, sizeof(zend_ini_scanner_globals), (ts_allocate_ctor) ini_scanner_globals_ctor, NULL); compiler_globals = ts_resource(compiler_globals_id); diff --git a/Zend/zend_globals.h b/Zend/zend_globals.h index 8257df32e831..8402faaf8116 100644 --- a/Zend/zend_globals.h +++ b/Zend/zend_globals.h @@ -51,8 +51,6 @@ BEGIN_EXTERN_C() ZEND_API extern int compiler_globals_id; ZEND_API extern int executor_globals_id; -ZEND_API extern size_t compiler_globals_offset; -ZEND_API extern size_t executor_globals_offset; END_EXTERN_C() #endif diff --git a/Zend/zend_globals_macros.h 
b/Zend/zend_globals_macros.h index bde10a0989d1..b1dec1ffb6e3 100644 --- a/Zend/zend_globals_macros.h +++ b/Zend/zend_globals_macros.h @@ -26,11 +26,26 @@ typedef struct _zend_executor_globals zend_executor_globals; typedef struct _zend_php_scanner_globals zend_php_scanner_globals; typedef struct _zend_ini_scanner_globals zend_ini_scanner_globals; +#ifdef ZEND_WIN32 +# define ZEND_TLS_API +# ifdef LIBZEND_EXPORTS +# define ZEND_TLS_DIRECT 1 +# endif +#else +# define ZEND_TLS_API ZEND_API +# define ZEND_TLS_DIRECT 1 +#endif + BEGIN_EXTERN_C() /* Compiler */ #ifdef ZTS -# define CG(v) ZEND_TSRMG_FAST(compiler_globals_offset, zend_compiler_globals *, v) +# ifdef ZEND_TLS_DIRECT +extern ZEND_TLS_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_compiler_globals compiler_globals_tls; +# define CG(v) (compiler_globals_tls.v) +# else +# define CG(v) ZEND_TSRMG(compiler_globals_id, zend_compiler_globals *, v) +# endif #else # define CG(v) (compiler_globals.v) extern ZEND_API struct _zend_compiler_globals compiler_globals; @@ -40,7 +55,12 @@ ZEND_API int zendparse(void); /* Executor */ #ifdef ZTS -# define EG(v) ZEND_TSRMG_FAST(executor_globals_offset, zend_executor_globals *, v) +# ifdef ZEND_TLS_DIRECT +extern ZEND_TLS_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_executor_globals executor_globals_tls; +# define EG(v) (executor_globals_tls.v) +# else +# define EG(v) ZEND_TSRMG(executor_globals_id, zend_executor_globals *, v) +# endif #else # define EG(v) (executor_globals.v) extern ZEND_API zend_executor_globals executor_globals; diff --git a/ext/opcache/jit/ir/ir_aarch64.dasc b/ext/opcache/jit/ir/ir_aarch64.dasc index fc4bb84f1e05..04d46cf0dfa6 100644 --- a/ext/opcache/jit/ir/ir_aarch64.dasc +++ b/ext/opcache/jit/ir/ir_aarch64.dasc @@ -5868,8 +5868,12 @@ static void ir_emit_tls(ir_ctx *ctx, ir_ref def, ir_insn *insn) | ldr Rx(reg), [Rx(reg), #insn->op3] || } ||# else +|| /* op2 == 0 with no index requests the bare thread pointer (used to form +|| * &EG/&CG with an add); a real TLS var never 
sits at tprel offset 0. */ +|| if (insn->op2 != 0 || insn->op3 != IR_NULL) { ||//??? IR_ASSERT(insn->op2 <= LDR_STR_PIMM64); -| ldr Rx(reg), [Rx(reg), #insn->op2] +| ldr Rx(reg), [Rx(reg), #insn->op2] +|| } ||# endif ||#endif if (IR_REG_SPILLED(ctx->regs[def][0])) { diff --git a/ext/opcache/jit/zend_jit_ir.c b/ext/opcache/jit/zend_jit_ir.c index cf43d3ad840f..91548eca45c3 100644 --- a/ext/opcache/jit/zend_jit_ir.c +++ b/ext/opcache/jit/zend_jit_ir.c @@ -201,21 +201,35 @@ static uint32_t default_mflags = 0; static bool delayed_call_chain = false; // TODO: remove this var (use jit->delayed_call_level) ??? #ifdef ZTS -static size_t tsrm_ls_cache_tcb_offset = 0; -static size_t tsrm_tls_index = -1; -static size_t tsrm_tls_offset = -1; - -# define EG_TLS_OFFSET(field) \ - (executor_globals_offset + offsetof(zend_executor_globals, field)) - -# define CG_TLS_OFFSET(field) \ - (compiler_globals_offset + offsetof(zend_compiler_globals, field)) +static size_t eg_tls_tcb_offset = 0; +static size_t cg_tls_tcb_offset = 0; +/* gottpoff yields the offset from the %fs-based thread pointer that ir_TLS(0) + * loads. */ +# if defined(__ELF__) && defined(__x86_64__) && defined(__GNUC__) && !defined(TSRM_TLS_MODEL_DEFAULT) +# define ZEND_JIT_TLS_TCB_OFFSET(sym) __extension__({ \ + size_t _off; \ + __asm__ ("movq " #sym "@gottpoff(%%rip),%0" : "=r" (_off)); \ + _off; \ + }) +# elif defined(__ELF__) && defined(__aarch64__) && !defined(__APPLE__) && \ + (defined(__GNUC__) || defined(__clang__)) +/* The TLS variable sits at a fixed offset from tpidr_el0 (the thread pointer + * the JIT reads with mrs); compute it once on the main thread. Subtracting the + * thread pointer is model-independent (works for both local- and initial-exec) + * and matches tsrm_get_ls_cache_tcb_offset()'s tprel reasoning. 
*/ +# define ZEND_JIT_TLS_TCB_OFFSET(sym) __extension__({ \ + char *_tp; \ + __asm__ ("mrs %0, tpidr_el0" : "=r" (_tp)); \ + (size_t)((char*)&(sym) - _tp); \ + }) +# else +# define ZEND_JIT_TLS_TCB_OFFSET(sym) ((size_t)0) +# endif # define jit_EG(_field) \ - ir_ADD_OFFSET(jit_TLS(jit), EG_TLS_OFFSET(_field)) - + ir_ADD_OFFSET(jit_EG_base(jit), offsetof(zend_executor_globals, _field)) # define jit_CG(_field) \ - ir_ADD_OFFSET(jit_TLS(jit), CG_TLS_OFFSET(_field)) + ir_ADD_OFFSET(jit_CG_base(jit), offsetof(zend_compiler_globals, _field)) #else @@ -298,7 +312,9 @@ typedef struct _zend_jit_ctx { uint32_t delayed_call_level; int b; /* current basic block number or -1 */ #ifdef ZTS - ir_ref tls; + ir_ref tp; /* cached thread pointer for &EG/&CG */ + ir_ref eg_tls; /* cached base of __thread executor_globals_tls */ + ir_ref cg_tls; /* cached base of __thread compiler_globals_tls */ #endif ir_ref fp; ir_ref poly_func_ref; /* restored from parent trace snapshot */ @@ -489,41 +505,60 @@ static const char* zend_reg_name(int8_t reg) /* IR helpers */ #ifdef ZTS -static void * ZEND_FASTCALL zend_jit_get_tsrm_ls_cache(void) +static void * ZEND_FASTCALL zend_jit_get_eg_tls(void) +{ + return &executor_globals_tls; +} +static void * ZEND_FASTCALL zend_jit_get_cg_tls(void) { - return _tsrm_ls_cache; + return &compiler_globals_tls; } -static ir_ref jit_TLS(zend_jit_ctx *jit) +/* Walk the control chain back from the current point: reuse the cached ref if we + * reach it (it still dominates here), but bail at a block start or a call, since + * the cached value lives in a caller-saved register that a call would clobber. 
*/ +static ir_ref jit_tls_reuse(zend_jit_ctx *jit, ir_ref cached) { - ZEND_ASSERT(jit->ctx.control); - if (jit->tls) { - /* Emit "TLS" once for basic block */ - ir_insn *insn; - ir_ref ref = jit->ctx.control; + ir_ref ref = jit->ctx.control; - while (1) { - if (ref == jit->tls) { - return jit->tls; - } - insn = &jit->ctx.ir_base[ref]; - if (insn->op >= IR_START || insn->op == IR_CALL) { - break; - } - ref = insn->op1; + while (cached) { + if (ref == cached) { + return cached; + } + ir_insn *insn = &jit->ctx.ir_base[ref]; + if (insn->op >= IR_START || insn->op == IR_CALL) { + break; } + ref = insn->op1; } + return IR_UNUSED; +} - if (tsrm_ls_cache_tcb_offset == 0 && tsrm_tls_index == -1) { - jit->tls = ir_CALL(IR_ADDR, ir_CONST_FC_FUNC(zend_jit_get_tsrm_ls_cache)); - } else { - jit->tls = ir_TLS( - tsrm_ls_cache_tcb_offset ? tsrm_ls_cache_tcb_offset : tsrm_tls_index, - tsrm_ls_cache_tcb_offset ? IR_NULL : tsrm_tls_offset); +/* Thread pointer, cached per basic block, used to form &EG/&CG with an add. */ +static ir_ref jit_TP(zend_jit_ctx *jit) +{ + ZEND_ASSERT(jit->ctx.control); + if (!jit_tls_reuse(jit, jit->tp)) { + jit->tp = ir_TLS(0, IR_NULL); } + return jit->tp; +} - return jit->tls; +/* Used where the TCB offset is unknown: resolve the base via a cached call. */ +static ir_ref jit_GLOBALS_TLS_call(zend_jit_ctx *jit, ir_ref *cache, const void *fn) +{ + ZEND_ASSERT(jit->ctx.control); + if (!jit_tls_reuse(jit, *cache)) { + *cache = ir_CALL(IR_ADDR, ir_CONST_FC_FUNC(fn)); + } + return *cache; } +# define jit_EG_base(jit) (eg_tls_tcb_offset \ + ? ir_ADD_OFFSET(jit_TP(jit), eg_tls_tcb_offset) \ + : jit_GLOBALS_TLS_call((jit), &(jit)->eg_tls, zend_jit_get_eg_tls)) +# define jit_CG_base(jit) (cg_tls_tcb_offset \ + ? 
ir_ADD_OFFSET(jit_TP(jit), cg_tls_tcb_offset) \ + : jit_GLOBALS_TLS_call((jit), &(jit)->cg_tls, zend_jit_get_cg_tls)) #endif static ir_ref jit_CONST_ADDR(zend_jit_ctx *jit, uintptr_t addr) @@ -2820,7 +2855,9 @@ static void zend_jit_init_ctx(zend_jit_ctx *jit, uint32_t flags) delayed_call_chain = false; jit->b = -1; #ifdef ZTS - jit->tls = IR_UNUSED; + jit->tp = IR_UNUSED; + jit->eg_tls = IR_UNUSED; + jit->cg_tls = IR_UNUSED; #endif jit->fp = IR_UNUSED; jit->poly_func_ref = IR_UNUSED; @@ -3214,7 +3251,8 @@ static void zend_jit_setup_disasm(void) REGISTER_DATA(CG(map_ptr_base)); #else /* ZTS */ - REGISTER_HELPER(zend_jit_get_tsrm_ls_cache); + REGISTER_HELPER(zend_jit_get_eg_tls); + REGISTER_HELPER(zend_jit_get_cg_tls); #endif #endif } @@ -3425,15 +3463,8 @@ static void zend_jit_setup(bool reattached) #endif #ifdef ZTS - zend_result result = zend_jit_resolve_tsrm_ls_cache_offsets( - &tsrm_ls_cache_tcb_offset, - &tsrm_tls_index, - &tsrm_tls_offset - ); - if (result == FAILURE) { - zend_accel_error(ACCEL_LOG_INFO, - "Could not get _tsrm_ls_cache offsets, will fallback to runtime resolution"); - } + eg_tls_tcb_offset = ZEND_JIT_TLS_TCB_OFFSET(executor_globals_tls); + cg_tls_tcb_offset = ZEND_JIT_TLS_TCB_OFFSET(compiler_globals_tls); #endif #if !defined(ZEND_WIN32) && !defined(IR_TARGET_AARCH64)