From fdf5225649efd6c679ce734731fe658dc070a369 Mon Sep 17 00:00:00 2001 From: Nicolas Grekas Date: Wed, 22 Apr 2026 20:17:54 +0200 Subject: [PATCH] feat: cross-platform force-kill primitive for stuck PHP threads Introduces a self-contained primitive that wakes a PHP thread parked in a blocking call (sleep, synchronous I/O, etc.) so the graceful drain used by RestartWorkers / DrainWorkers / Shutdown completes promptly instead of waiting for the syscall to return naturally. Design: each PHP thread, at boot from its own TSRM context, hands a force_kill_slot (pointers to its EG(vm_interrupt) and EG(timed_out) atomic bools, plus pthread_t / Windows HANDLE) back to Go via go_frankenphp_store_force_kill_slot. The slot lives on phpThread and is protected by a per-thread RWMutex so the zero-and-release path at thread exit cannot race an in-flight kill. From any goroutine, Go passes the slot back to frankenphp_force_kill_thread, which stores true into both bools (waking the VM at the next opcode boundary, routing through zend_timeout -> "Maximum execution time exceeded") and delivers a platform-specific wake-up: - Linux/FreeBSD: pthread_kill(SIGRTMIN+3) with a no-op handler installed via pthread_once, SA_ONSTACK, no SA_RESTART. Signal delivery causes the in-flight blocking syscall to return EINTR. - Windows: CancelSynchronousIo + QueueUserAPC covers alertable I/O and SleepEx. Non-alertable Sleep (including PHP's usleep) stays uninterruptible. - macOS: atomic-bool-only path. Threads stuck in blocking syscalls wait for the syscall to complete naturally. Reserved signal: SIGRTMIN+3. PHP's pcntl_signal(SIGRTMIN+3, ...) clobbers it; embedders whose own Go code uses that signal must patch the constant. glibc NPTL reserves SIGRTMIN..SIGRTMIN+2. Drain integration: drainWorkerThreads waits drainGracePeriod (5s) for each thread to reach Yielding, then arms force-kill on stragglers and keeps waiting until they yield. phpThread.shutdown does the same. 
There is no abandon path: if a thread is stuck in a syscall that force-kill cannot interrupt (macOS, Windows non-alertable Sleep), the drain blocks until the syscall returns naturally, exactly as it did before this patch. Where force-kill does work, the drain is typically much faster because it cuts a 60s sleep down to milliseconds. Operators that want a harder bound rely on their orchestrator (systemd, k8s, supervisord) to SIGKILL the process. worker_test.go + testdata/worker-sleep.php exercise the full path: the test marks a file before sleep(60), polls until the worker is proven parked, then asserts RestartWorkers completes within the grace period and that the post-sleep echo never runs (which would mean the VM interrupt was never observed). --- frankenphp.c | 117 ++++++++++++++++++++++++++++++++++++-- frankenphp.h | 33 +++++++++++ phpmainthread.go | 7 +++ phpthread.go | 52 ++++++++++++++++- testdata/worker-sleep.php | 21 +++++++ worker.go | 49 ++++++++++++---- worker_test.go | 66 +++++++++++++++++++++ 7 files changed, 329 insertions(+), 16 deletions(-) create mode 100644 testdata/worker-sleep.php diff --git a/frankenphp.c b/frankenphp.c index 0cc294e397..717b047d81 100644 --- a/frankenphp.c +++ b/frankenphp.c @@ -92,6 +92,78 @@ static bool is_forked_child = false; static void frankenphp_fork_child(void) { is_forked_child = true; } #endif +/* Best-effort force-kill for stuck PHP threads. + * + * Each thread captures &EG(vm_interrupt) / &EG(timed_out) at boot and + * hands them to Go via go_frankenphp_store_force_kill_slot. To kill, + * Go passes the slot back to frankenphp_force_kill_thread, which stores + * true into both bools (the VM bails through zend_timeout() at the next + * opcode boundary) and then wakes any in-flight syscall: + * - Linux/FreeBSD: pthread_kill(SIGRTMIN+3) -> EINTR. + * - Windows: CancelSynchronousIo + QueueUserAPC for alertable I/O + + * SleepEx. Non-alertable Sleep (including PHP's usleep) stays stuck. 
+ * - macOS: atomic-bool only; busy loops bail, blocking syscalls don't. + * + * Reserved signal: SIGRTMIN+3. PHP's pcntl_signal(SIGRTMIN+3, ...) + * clobbers it. glibc NPTL reserves SIGRTMIN..SIGRTMIN+2; embedders with + * their own Go signal usage may need to patch this constant. + * + * The slot lives Go-side on phpThread; the C side has no global table. + * The signal handler is installed once via pthread_once. */ +#ifdef PHP_WIN32 +static void CALLBACK frankenphp_noop_apc(ULONG_PTR param) { (void)param; } +#endif + +#ifdef FRANKENPHP_HAS_KILL_SIGNAL +/* No-op: delivery itself is what unblocks the syscall via EINTR. */ +static void frankenphp_kill_signal_handler(int sig) { (void)sig; } + +static pthread_once_t kill_signal_handler_installed = PTHREAD_ONCE_INIT; +static void install_kill_signal_handler(void) { + /* No SA_RESTART so syscalls return EINTR rather than being restarted. + * SA_ONSTACK guards against an accidental process-level delivery to a + * Go-managed thread, where Go requires the alternate signal stack. */ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = frankenphp_kill_signal_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_ONSTACK; + sigaction(FRANKENPHP_KILL_SIGNAL, &sa, NULL); +} +#endif + +void frankenphp_force_kill_thread(force_kill_slot slot) { + if (slot.vm_interrupt == NULL) { + /* Boot aborted before the slot was published. */ + return; + } + /* Atomic stores first: by the time the thread wakes (signal-driven or + * natural) the VM sees them and bails through zend_timeout(). */ + zend_atomic_bool_store(slot.timed_out, true); + zend_atomic_bool_store(slot.vm_interrupt, true); + +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + /* ESRCH (thread already exited) / EINVAL are both benign here. 
*/ + pthread_kill(slot.tid, FRANKENPHP_KILL_SIGNAL); +#elif defined(PHP_WIN32) + if (slot.thread_handle != NULL) { + CancelSynchronousIo(slot.thread_handle); + QueueUserAPC((PAPCFUNC)frankenphp_noop_apc, slot.thread_handle, 0); + } +#endif +} + +/* CloseHandle on Windows; no-op on POSIX. */ +void frankenphp_release_thread_for_kill(force_kill_slot slot) { +#ifdef PHP_WIN32 + if (slot.thread_handle != NULL) { + CloseHandle(slot.thread_handle); + } +#else + (void)slot; +#endif +} + void frankenphp_update_local_thread_context(bool is_worker) { is_worker_thread = is_worker; @@ -1065,6 +1137,16 @@ static void *php_thread(void *arg) { snprintf(thread_name, 16, "php-%" PRIxPTR, thread_index); set_thread_name(thread_name); +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + /* The spawning Go-managed M may block realtime signals, which the + * new pthread inherits. Unblock FRANKENPHP_KILL_SIGNAL here so + * force-kill deliveries are not silently dropped. */ + sigset_t unblock; + sigemptyset(&unblock); + sigaddset(&unblock, FRANKENPHP_KILL_SIGNAL); + pthread_sigmask(SIG_UNBLOCK, &unblock, NULL); +#endif + /* Initial allocation of all global PHP memory for this thread */ #ifdef ZTS (void)ts_resource(0); @@ -1073,6 +1155,29 @@ static void *php_thread(void *arg) { #endif #endif + /* Publish this thread's force-kill slot to Go so the graceful-drain + * grace period can wake it from a busy PHP loop or blocking syscall. + * Must run on the PHP thread itself: EG() resolves to its own TSRM + * context and pthread_self() captures the right tid. 
*/ + { + force_kill_slot slot; + memset(&slot, 0, sizeof(slot)); + slot.vm_interrupt = &EG(vm_interrupt); + slot.timed_out = &EG(timed_out); +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + slot.tid = pthread_self(); + pthread_once(&kill_signal_handler_installed, install_kill_signal_handler); +#elif defined(PHP_WIN32) + if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), + GetCurrentProcess(), &slot.thread_handle, 0, FALSE, + DUPLICATE_SAME_ACCESS)) { + /* On failure, force_kill falls back to atomic-bool only. */ + slot.thread_handle = NULL; + } +#endif + go_frankenphp_store_force_kill_slot(thread_index, slot); + } + bool thread_is_healthy = true; bool has_attempted_shutdown = false; @@ -1150,6 +1255,11 @@ static void *php_thread(void *arg) { } zend_end_try(); + /* Must precede ts_free_thread: that frees the TSRM storage backing + * the slot's &EG() pointers. Clearing first means any concurrent + * force-kill either ran before us or sees a zero slot. */ + go_frankenphp_clear_force_kill_slot(thread_index); + /* free all global PHP memory reserved for this thread */ #ifdef ZTS ts_free_thread(); @@ -1158,12 +1268,9 @@ static void *php_thread(void *arg) { /* Thread is healthy, signal to Go that the thread has shut down */ if (thread_is_healthy) { go_frankenphp_on_thread_shutdown(thread_index); - return NULL; } - /* Thread is unhealthy, PHP globals might be in a bad state after a bailout, - * restart the entire thread */ frankenphp_log_message("Restarting unhealthy thread", LOG_WARNING); if (!frankenphp_new_php_thread(thread_index)) { @@ -1265,7 +1372,9 @@ static void *php_main(void *arg) { go_frankenphp_main_thread_is_ready(); - /* channel closed, shutdown gracefully */ + /* channel closed, shutdown gracefully. drainPHPThreads has already + * waited for every PHP thread to exit (state.Done), so SAPI/TSRM + * teardown here is safe. 
*/ frankenphp_sapi_module.shutdown(&frankenphp_sapi_module); sapi_shutdown(); diff --git a/frankenphp.h b/frankenphp.h index 0ea8c80f41..31df007f18 100644 --- a/frankenphp.h +++ b/frankenphp.h @@ -46,6 +46,28 @@ static inline HRESULT LongLongSub(LONGLONG llMinuend, LONGLONG llSubtrahend, #include #include +#ifndef PHP_WIN32 +#include +#include +#endif + +/* Platform capabilities for the force-kill primitive; declared in the + * header so Go (via CGo) gets the correct struct layout too. */ +#if !defined(PHP_WIN32) && defined(SIGRTMIN) +#define FRANKENPHP_HAS_KILL_SIGNAL 1 +#define FRANKENPHP_KILL_SIGNAL (SIGRTMIN + 3) +#endif + +typedef struct { + zend_atomic_bool *vm_interrupt; + zend_atomic_bool *timed_out; +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + pthread_t tid; +#elif defined(PHP_WIN32) + HANDLE thread_handle; +#endif +} force_kill_slot; + #ifndef FRANKENPHP_VERSION #define FRANKENPHP_VERSION dev #endif @@ -193,6 +215,17 @@ void frankenphp_init_thread_metrics(int max_threads); void frankenphp_destroy_thread_metrics(void); size_t frankenphp_get_thread_memory_usage(uintptr_t thread_index); +/* Best-effort force-kill primitives. The slot is populated by each PHP + * thread at boot (an internal helper calls back into Go via + * go_frankenphp_store_force_kill_slot) and lives in the Go-side phpThread. + * force_kill_thread interrupts the Zend VM at the next opcode boundary; + * where SIGRTMIN is defined (Linux/FreeBSD) it also delivers SIGRTMIN+3 + * to the target thread; on Windows it calls CancelSynchronousIo + + * QueueUserAPC. release_thread_for_kill drops any OS-owned resource + * tied to the slot (currently the Windows thread handle). 
*/ +void frankenphp_force_kill_thread(force_kill_slot slot); +void frankenphp_release_thread_for_kill(force_kill_slot slot); + void register_extensions(zend_module_entry **m, int len); #endif diff --git a/phpmainthread.go b/phpmainthread.go index 7f9b8fb947..b892d52f19 100644 --- a/phpmainthread.go +++ b/phpmainthread.go @@ -54,6 +54,8 @@ func initPHPThreads(numThreads int, numMaxThreads int, phpIni map[string]string) return nil, err } + // Must follow start(): maxThreads is only final once + // setAutomaticMaxThreads runs on the main PHP thread (before Ready). C.frankenphp_init_thread_metrics(C.int(mainThread.maxThreads)) // initialize all other threads @@ -79,6 +81,11 @@ func drainPHPThreads() { if mainThread == nil { return // mainThread was never initialized } + // Idempotent: post-drain state is Reserved; a re-entry (e.g. a + // failed-Init cleanup) must not double-close mainThread.done. + if mainThread.state.Is(state.Reserved) { + return + } doneWG := sync.WaitGroup{} doneWG.Add(len(phpThreads)) mainThread.state.Set(state.ShuttingDown) diff --git a/phpthread.go b/phpthread.go index a941de9348..fec8737537 100644 --- a/phpthread.go +++ b/phpthread.go @@ -8,6 +8,7 @@ import ( "runtime" "sync" "sync/atomic" + "time" "unsafe" "github.com/dunglas/frankenphp/internal/state" @@ -25,6 +26,12 @@ type phpThread struct { contextMu sync.RWMutex state *state.ThreadState requestCount atomic.Int64 + // forceKill holds &EG() pointers captured on the PHP thread itself. + // forceKillMu pairs with go_frankenphp_clear_force_kill_slot's write + // lock so a concurrent kill never dereferences pointers freed by + // ts_free_thread. 
+ forceKillMu sync.RWMutex + forceKill C.force_kill_slot } // threadHandler defines how the callbacks from the C thread should be handled @@ -93,7 +100,27 @@ func (thread *phpThread) shutdown() { } close(thread.drainChan) - thread.state.WaitFor(state.Done) + + // Arm force-kill after the grace period to wake any thread stuck in + // a blocking syscall (sleep, blocking I/O). The wait remains + // unbounded - on platforms where force-kill cannot interrupt the + // syscall (macOS, Windows non-alertable Sleep) the thread will exit + // when the syscall completes naturally; the operator's orchestrator + // is responsible for any harder timeout. + done := make(chan struct{}) + go func() { + thread.state.WaitFor(state.Done) + close(done) + }() + select { + case <-done: + case <-time.After(drainGracePeriod): + thread.forceKillMu.RLock() + C.frankenphp_force_kill_thread(thread.forceKill) + thread.forceKillMu.RUnlock() + <-done + } + thread.drainChan = make(chan struct{}) // threads go back to the reserved state from which they can be booted again @@ -203,6 +230,29 @@ func go_frankenphp_after_script_execution(threadIndex C.uintptr_t, exitStatus C. thread.Unpin() } +//export go_frankenphp_store_force_kill_slot +func go_frankenphp_store_force_kill_slot(threadIndex C.uintptr_t, slot C.force_kill_slot) { + thread := phpThreads[threadIndex] + thread.forceKillMu.Lock() + // Release any prior slot's OS resource (Windows HANDLE) before + // overwriting; a phpThread can reboot and re-register. + C.frankenphp_release_thread_for_kill(thread.forceKill) + thread.forceKill = slot + thread.forceKillMu.Unlock() +} + +//export go_frankenphp_clear_force_kill_slot +func go_frankenphp_clear_force_kill_slot(threadIndex C.uintptr_t) { + // Called from C before ts_free_thread on both exit paths. Zeroing + // the slot under the write lock guarantees any concurrent kill + // either completed before we got the lock or sees a zero slot. 
+ thread := phpThreads[threadIndex] + thread.forceKillMu.Lock() + C.frankenphp_release_thread_for_kill(thread.forceKill) + thread.forceKill = C.force_kill_slot{} + thread.forceKillMu.Unlock() +} + //export go_frankenphp_on_thread_shutdown func go_frankenphp_on_thread_shutdown(threadIndex C.uintptr_t) { thread := phpThreads[threadIndex] diff --git a/testdata/worker-sleep.php b/testdata/worker-sleep.php new file mode 100644 index 0000000000..20eeb61bf8 --- /dev/null +++ b/testdata/worker-sleep.php @@ -0,0 +1,21 @@ +