diff --git a/caddy/admin.go b/caddy/admin.go index 8515f11326..8d7b7f3df3 100644 --- a/caddy/admin.go +++ b/caddy/admin.go @@ -39,7 +39,13 @@ func (admin *FrankenPHPAdmin) restartWorkers(w http.ResponseWriter, r *http.Requ return admin.error(http.StatusMethodNotAllowed, fmt.Errorf("method not allowed")) } - frankenphp.RestartWorkers() + if err := frankenphp.RestartWorkers(); err != nil { + // Restart is incomplete: at least one worker thread was stuck in + // an uninterruptible blocking call and did not reload code. Do + // not let the admin endpoint lie to automation with a 200. + caddy.Log().Sugar().Errorf("workers restart incomplete: %v", err) + return admin.error(http.StatusInternalServerError, err) + } caddy.Log().Info("workers restarted from admin api") admin.success(w, "workers restarted successfully\n") diff --git a/frankenphp.c b/frankenphp.c index 0cc294e397..4ea23c0e4e 100644 --- a/frankenphp.c +++ b/frankenphp.c @@ -92,6 +92,148 @@ static bool is_forked_child = false; static void frankenphp_fork_child(void) { is_forked_child = true; } #endif +/* Best-effort force-kill for PHP threads after the graceful-drain grace + * period. Each thread captures pointers to its own executor_globals' + * vm_interrupt and timed_out atomic bools at boot and hands them back to + * Go via go_frankenphp_store_force_kill_slot. From any goroutine, the + * Go side passes that slot back to frankenphp_force_kill_thread, which + * stores true into both bools, waking the VM at the next opcode boundary + * and unwinding the thread through zend_timeout(). + * + * On platforms with POSIX realtime signals (Linux, FreeBSD), force-kill + * also delivers SIGRTMIN+3 to the target thread so any in-flight blocking + * syscall (select, sleep, nanosleep, blocking I/O without SA_RESTART) + * returns EINTR and the VM gets a chance to observe the atomic bools on + * the next opcode. On Windows, CancelSynchronousIo + QueueUserAPC does + * the equivalent for alertable I/O and SleepEx. Non-alertable Sleep() + * (including PHP's usleep on Windows) stays uninterruptible - the VM + * must wait for it to return naturally before bailing. + * + * macOS has no realtime signals exposed to user-space, so the atomic + * bool path is the only mechanism there: threads busy-looping in PHP + * are killed promptly, threads stuck in blocking syscalls wait to + * return on their own. + * + * JIT caveat: when the OPcache JIT is enabled, some hot code paths do + * not check vm_interrupt between opcodes. A thread stuck in a + * JIT-compiled busy loop may not observe the atomic-bool store at all + * (see https://github.com/php/php-src/issues/21267). The syscall- + * interruption path (signal -> EINTR) still works since the kernel + * wakes the thread regardless of JIT state, so the regression surface + * is pure-PHP busy loops under JIT. Those fall through to the abandon + * path after forceKillDeadline. + * + * Signal number reservation: SIGRTMIN+3 is reserved by FrankenPHP for + * force-kill. If a PHP user script registers its own handler via + * pcntl_signal(SIGRTMIN+3, ...), it clobbers ours and force-kill stops + * working for threads it runs on. Projects embedding FrankenPHP + * alongside their own Go code that also uses that signal must choose a + * different one here. glibc's NPTL reserves SIGRTMIN..SIGRTMIN+2 for + * its own use, so do not move this offset downward. + * + * The slot lives in the Go-side phpThread struct - there is no C-side + * array or init/destroy dance. Signal handler installation happens once + * via pthread_once the first time a thread registers. */ +#ifdef PHP_WIN32 +static void CALLBACK frankenphp_noop_apc(ULONG_PTR param) { (void)param; } +#endif + +#ifdef FRANKENPHP_HAS_KILL_SIGNAL +/* No-op handler: signal delivery is sufficient on its own because it + * forces the in-flight syscall to return EINTR. The VM then observes + * vm_interrupt on the next opcode and unwinds via zend_timeout(). */ +static void frankenphp_kill_signal_handler(int sig) { (void)sig; } + +static pthread_once_t kill_signal_handler_installed = PTHREAD_ONCE_INIT; +static void install_kill_signal_handler(void) { + /* Install the no-op handler process-wide without SA_RESTART so blocking + * syscalls return EINTR when the signal is delivered rather than being + * transparently restarted by libc. SA_ONSTACK is set defensively: the + * signal targets non-Go pthreads via pthread_kill, but if it's ever + * delivered to a Go-managed thread (e.g. through accidental process- + * level raise), Go requires the handler to run on the alternate signal + * stack to avoid corrupting the goroutine's. */ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = frankenphp_kill_signal_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_ONSTACK; + sigaction(FRANKENPHP_KILL_SIGNAL, &sa, NULL); +} +#endif + +/* shutdown_in_progress is toggled by the Go side through + * frankenphp_set_shutdown_in_progress(). It is the only honest signal the + * unhealthy-thread restart path has to tell "we are tearing the runtime + * down, do not respawn" apart from normal operation - thread_metrics is + * never NULL anymore because Shutdown intentionally leaves it allocated + * for abandoned threads still writing into it. */ +static zend_atomic_bool shutdown_in_progress; + +void frankenphp_set_shutdown_in_progress(bool v) { + zend_atomic_bool_store(&shutdown_in_progress, v); +} + +/* Called by each PHP thread at boot, from its own TSRM context, so that + * the EG-backed addresses resolve to the thread's private executor_globals + * and the captured thread identity refers to itself. Hands the slot to + * the Go side via go_frankenphp_store_force_kill_slot; the slot's + * lifetime is the phpThread's. */ +void frankenphp_register_thread_for_kill(uintptr_t idx) { + force_kill_slot slot; + memset(&slot, 0, sizeof(slot)); + slot.vm_interrupt = &EG(vm_interrupt); + slot.timed_out = &EG(timed_out); +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + slot.tid = pthread_self(); + pthread_once(&kill_signal_handler_installed, install_kill_signal_handler); +#elif defined(PHP_WIN32) + if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), + GetCurrentProcess(), &slot.thread_handle, 0, FALSE, + DUPLICATE_SAME_ACCESS)) { + /* DuplicateHandle can fail under resource pressure; leave the handle + * NULL so force_kill_thread falls back to the atomic-bool path only. */ + slot.thread_handle = NULL; + } +#endif + go_frankenphp_store_force_kill_slot(idx, slot); +} + +void frankenphp_force_kill_thread(force_kill_slot slot) { + if (slot.vm_interrupt == NULL) { + /* Thread never reached register_thread_for_kill (aborted during boot). */ + return; + } + /* Set the atomic bools first so that by the time the thread wakes up - + * whether from our signal/APC or naturally - the VM sees them and + * routes through zend_timeout() -> "Maximum execution time exceeded". */ + zend_atomic_bool_store(slot.timed_out, true); + zend_atomic_bool_store(slot.vm_interrupt, true); + +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + /* Return value intentionally ignored: ESRCH (thread already exited) and + * EINVAL are both benign - there is simply nothing to unblock. */ + pthread_kill(slot.tid, FRANKENPHP_KILL_SIGNAL); +#elif defined(PHP_WIN32) + if (slot.thread_handle != NULL) { + CancelSynchronousIo(slot.thread_handle); + QueueUserAPC((PAPCFUNC)frankenphp_noop_apc, slot.thread_handle, 0); + } +#endif +} + +/* Releases any OS resource tied to the slot (currently: CloseHandle on + * Windows). Called by the Go side when a phpThread is torn down. */ +void frankenphp_release_thread_for_kill(force_kill_slot slot) { +#ifdef PHP_WIN32 + if (slot.thread_handle != NULL) { + CloseHandle(slot.thread_handle); + } +#else + (void)slot; +#endif +} + void frankenphp_update_local_thread_context(bool is_worker) { is_worker_thread = is_worker; @@ -1065,6 +1207,23 @@ static void *php_thread(void *arg) { snprintf(thread_name, 16, "php-%" PRIxPTR, thread_index); set_thread_name(thread_name); + /* Tell the Go side a new native thread is entering the main loop so + * initPHPThreads can Wait() for abandoned threads from a previous + * Init cycle to fully unwind before reassigning phpThreads. Paired + * with go_frankenphp_thread_exited() at the single exit: label below. */ + go_frankenphp_thread_spawned(); + +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + /* pthread_create inherits the caller's signal mask. frankenphp_new_php_thread + * is typically called from a goroutine pinned to a Go-managed M whose mask + * may block realtime signals. Explicitly unblock FRANKENPHP_KILL_SIGNAL so + * force-kill delivery is not silently discarded on this thread. */ + sigset_t unblock; + sigemptyset(&unblock); + sigaddset(&unblock, FRANKENPHP_KILL_SIGNAL); + pthread_sigmask(SIG_UNBLOCK, &unblock, NULL); +#endif + /* Initial allocation of all global PHP memory for this thread */ #ifdef ZTS (void)ts_resource(0); @@ -1073,6 +1232,11 @@ static void *php_thread(void *arg) { #endif #endif + /* Register this thread's vm_interrupt/timed_out addresses so the Go side + * can force-kill it after the graceful-drain grace period if it gets stuck + * in a busy PHP loop. */ + frankenphp_register_thread_for_kill(thread_index); + bool thread_is_healthy = true; bool has_attempted_shutdown = false; @@ -1150,6 +1314,15 @@ static void *php_thread(void *arg) { } zend_end_try(); + /* Clear the force-kill slot BEFORE ts_free_thread: that call frees + * the TSRM storage that &EG(vm_interrupt) / &EG(timed_out) point at. + * Clearing afterwards (even under a write lock) would leave a window + * where a concurrent delivery reads the still-populated slot and + * writes into freed memory. Applies to both the healthy exit and the + * unhealthy-restart path below so every call to force_kill_thread + * sees either a valid or a zero-valued slot. */ + go_frankenphp_clear_force_kill_slot(thread_index); + /* free all global PHP memory reserved for this thread */ #ifdef ZTS ts_free_thread(); @@ -1158,12 +1331,20 @@ static void *php_thread(void *arg) { /* Thread is healthy, signal to Go that the thread has shut down */ if (thread_is_healthy) { go_frankenphp_on_thread_shutdown(thread_index); - - return NULL; + goto exit; } /* Thread is unhealthy, PHP globals might be in a bad state after a bailout, - * restart the entire thread */ + * restart the entire thread - unless the Go side has already declared the + * runtime to be shutting down via frankenphp_set_shutdown_in_progress(). + * Respawning past that point would hand a fresh pthread a phpThreads + * slice that drainPHPThreads has already stopped tracking. */ + if (zend_atomic_bool_load(&shutdown_in_progress)) { + frankenphp_log_message( + "Unhealthy thread unwinding after Shutdown; not restarting", + LOG_WARNING); + goto exit; + } frankenphp_log_message("Restarting unhealthy thread", LOG_WARNING); if (!frankenphp_new_php_thread(thread_index)) { @@ -1171,6 +1352,12 @@ static void *php_thread(void *arg) { frankenphp_log_message("Failed to restart an unhealthy thread", LOG_ERR); } +exit: + /* Single exit point: every path above that took the spawned() Add must + * route through here so lingeringThreads.Wait() in initPHPThreads can + * observe termination. Adding a new return above without going through + * exit would leak one Add across Init cycles. */ + go_frankenphp_thread_exited(); return NULL; } @@ -1265,17 +1452,25 @@ static void *php_main(void *arg) { go_frankenphp_main_thread_is_ready(); - /* channel closed, shutdown gracefully */ - frankenphp_sapi_module.shutdown(&frankenphp_sapi_module); - - sapi_shutdown(); + /* channel closed, shutdown gracefully. If an abandoned PHP thread is + * still alive in a blocked syscall (RestartWorkers/Shutdown gave up + * after the force-kill deadline), wait a bounded window for it to + * unwind before running SAPI/TSRM teardown. On timeout, skip teardown + * entirely so the late-unwinding thread cannot touch freed state via + * ts_free_thread / php_request_shutdown (zend_catch) / SAPI callbacks. + * Process exit will reclaim the leaked state. */ + if (go_frankenphp_can_teardown()) { + frankenphp_sapi_module.shutdown(&frankenphp_sapi_module); + + sapi_shutdown(); #ifdef ZTS - tsrm_shutdown(); + tsrm_shutdown(); #endif - if (frankenphp_sapi_module.ini_entries) { - free((char *)frankenphp_sapi_module.ini_entries); - frankenphp_sapi_module.ini_entries = NULL; + if (frankenphp_sapi_module.ini_entries) { + free((char *)frankenphp_sapi_module.ini_entries); + frankenphp_sapi_module.ini_entries = NULL; + } } go_frankenphp_shutdown_main_thread(); @@ -1470,6 +1665,12 @@ int frankenphp_reset_opcache(void) { int frankenphp_get_current_memory_limit() { return PG(memory_limit); } void frankenphp_init_thread_metrics(int max_threads) { + /* Free any allocation left over from a prior Init: Shutdown no longer + * calls frankenphp_destroy_thread_metrics (abandoned threads may still + * be writing into the array when the blocked syscall unwinds), but + * initPHPThreads waits on lingeringThreads before reaching us so any + * such abandoned thread has already exited by the time we reallocate. */ + free(thread_metrics); thread_metrics = calloc(max_threads, sizeof(frankenphp_thread_metrics)); } diff --git a/frankenphp.go b/frankenphp.go index 52246d01c7..408811bf5c 100644 --- a/frankenphp.go +++ b/frankenphp.go @@ -313,6 +313,16 @@ func Init(options ...Option) error { mainThread, err := initPHPThreads(opt.numThreads, opt.maxThreads, opt.phpIni) if err != nil { + // ErrTeardownSkipped means a prior Shutdown already tore down + // everything it could and latched the teardown-skipped flag; + // nothing new has been started in this Init call, so calling + // Shutdown here would re-enter drainPHPThreads and double-close + // the previous generation's already-closed mainThread.done. Just + // reset the running flag so ServeHTTP returns ErrNotRunning. + if errors.Is(err, ErrTeardownSkipped) { + isRunning = false + return err + } Shutdown() return err } diff --git a/frankenphp.h b/frankenphp.h index 0ea8c80f41..e3fd733778 100644 --- a/frankenphp.h +++ b/frankenphp.h @@ -46,6 +46,28 @@ static inline HRESULT LongLongSub(LONGLONG llMinuend, LONGLONG llSubtrahend, #include #include +#ifndef PHP_WIN32 +#include +#include +#endif + +/* Platform capabilities for the force-kill primitive; declared in the + * header so Go (via CGo) gets the correct struct layout too. */ +#if !defined(PHP_WIN32) && defined(SIGRTMIN) +#define FRANKENPHP_HAS_KILL_SIGNAL 1 +#define FRANKENPHP_KILL_SIGNAL (SIGRTMIN + 3) +#endif + +typedef struct { + zend_atomic_bool *vm_interrupt; + zend_atomic_bool *timed_out; +#ifdef FRANKENPHP_HAS_KILL_SIGNAL + pthread_t tid; +#elif defined(PHP_WIN32) + HANDLE thread_handle; +#endif +} force_kill_slot; + #ifndef FRANKENPHP_VERSION #define FRANKENPHP_VERSION dev #endif @@ -193,6 +215,19 @@ void frankenphp_init_thread_metrics(int max_threads); void frankenphp_destroy_thread_metrics(void); size_t frankenphp_get_thread_memory_usage(uintptr_t thread_index); +/* Best-effort force-kill primitives. The slot is populated by each PHP + * thread at boot (frankenphp_register_thread_for_kill calls back into Go + * via go_frankenphp_store_force_kill_slot) and lives in the Go-side + * phpThread. force_kill_thread interrupts the Zend VM at the next opcode + * boundary; on POSIX it also delivers SIGRTMIN+3 to the target thread, + * on Windows it calls CancelSynchronousIo + QueueUserAPC. release_thread + * drops any OS-owned resource tied to the slot (currently the Windows + * thread handle). */ +void frankenphp_set_shutdown_in_progress(bool v); +void frankenphp_register_thread_for_kill(uintptr_t thread_index); +void frankenphp_force_kill_thread(force_kill_slot slot); +void frankenphp_release_thread_for_kill(force_kill_slot slot); + void register_extensions(zend_module_entry **m, int len); #endif diff --git a/internal/state/state.go b/internal/state/state.go index f8d2b3acb7..45464c744b 100644 --- a/internal/state/state.go +++ b/internal/state/state.go @@ -35,6 +35,13 @@ const ( Rebooting // C thread has exited and ZTS state is cleaned up, ready to spawn a new C thread RebootReady + + // Abandoned is set by RestartWorkers on threads that did not yield + // within the force-kill deadline. Handlers treat it like ShuttingDown + // on the next callback, so an abandoned thread that eventually + // unwinds exits cleanly instead of re-entering the serve loop with + // stale request state. + Abandoned ) func (s State) String() string { @@ -67,6 +74,8 @@ func (s State) String() string { return "rebooting" case RebootReady: return "reboot ready" + case Abandoned: + return "abandoned" default: return "unknown" } @@ -172,8 +181,11 @@ func (ts *ThreadState) WaitFor(states ...State) { func (ts *ThreadState) RequestSafeStateChange(nextState State) bool { ts.mu.Lock() switch ts.currentState { - // disallow state changes if shutting down or done - case ShuttingDown, Done, Reserved: + // disallow state changes if shutting down, done, reserved, or abandoned. + // Abandoned threads transition to Done when they eventually unwind and + // never pass through Ready/Inactive/ShuttingDown, so waiting for a safe + // state would park forever. + case ShuttingDown, Done, Reserved, Abandoned: ts.mu.Unlock() return false @@ -187,8 +199,14 @@ func (ts *ThreadState) RequestSafeStateChange(nextState State) bool { } ts.mu.Unlock() - // wait for the state to change to a safe state - ts.WaitFor(Ready, Inactive, ShuttingDown) + // Wait for the state to become safe to transition from, or to reach a + // terminal state. Done and Abandoned are included because a thread can + // race here concurrently with a RestartWorkers that marks it Abandoned, + // or with its own teardown that ends in Done; without them in the set + // this goroutine would park forever waiting for Ready/Inactive/ + // ShuttingDown that will never come. The recursive call below re-enters + // the reject branch on Done/Abandoned and returns false. + ts.WaitFor(Ready, Inactive, ShuttingDown, Done, Abandoned) return ts.RequestSafeStateChange(nextState) } diff --git a/phpmainthread.go b/phpmainthread.go index 7f9b8fb947..08473c5416 100644 --- a/phpmainthread.go +++ b/phpmainthread.go @@ -6,15 +6,38 @@ package frankenphp // #include import "C" import ( + "errors" "log/slog" "strings" "sync" + "sync/atomic" + "time" "github.com/dunglas/frankenphp/internal/memory" "github.com/dunglas/frankenphp/internal/phpheaders" "github.com/dunglas/frankenphp/internal/state" ) +// ErrTeardownSkipped is returned by Init when a prior Shutdown skipped +// SAPI/TSRM teardown because abandoned threads did not exit within +// mainThreadShutdownDeadline. PHP's runtime state was left live in that +// case and is not safe to initialise on top of; the process must be +// restarted. See RestartWorkers' doc for the full chain. +var ErrTeardownSkipped = errors.New("cannot Init: prior Shutdown skipped SAPI/TSRM teardown because abandoned PHP threads did not exit; process must be restarted") + +// teardownSkipped is set by go_frankenphp_can_teardown when its bounded +// wait on lingeringThreads expires. Once true it stays true for the +// lifetime of the process: there is no safe point at which a never- +// torn-down SAPI/TSRM becomes re-initialisable. +var teardownSkipped atomic.Bool + +// mainThreadShutdownDeadline caps how long php_main waits for abandoned +// threads to unwind before running sapi_shutdown/tsrm_shutdown. When the +// wait expires we skip SAPI/TSRM teardown entirely: a late-unwinding +// abandoned thread would otherwise touch freed TSRM storage via +// ts_free_thread or the SAPI via php_request_shutdown in zend_catch. +const mainThreadShutdownDeadline = 5 * time.Second + // represents the main PHP thread // the thread needs to keep running as long as all other threads are running type phpMainThread struct { @@ -34,7 +57,40 @@ var ( // initPHPThreads starts the main PHP thread, // a fixed number of inactive PHP threads // and reserves a fixed number of possible PHP threads +// +// Precondition: every prior initPHPThreads call must be paired with a +// drainPHPThreads before calling this function again. Calling Init twice +// without an intervening Shutdown blocks here forever because the first +// generation's C threads are still incrementing lingeringThreads. +// +// Returns ErrTeardownSkipped if a prior Shutdown's bounded wait on +// abandoned threads expired and SAPI/TSRM teardown was skipped. The +// library refuses to re-initialise on top of never-torn-down PHP state +// rather than silently corrupt the new generation; the caller must +// terminate the process. func initPHPThreads(numThreads int, numMaxThreads int, phpIni map[string]string) (*phpMainThread, error) { + // A prior Shutdown skipped SAPI/TSRM teardown because abandoned + // threads did not exit in time. PHP's runtime state is still live + // from that generation and sapi_startup/tsrm_startup would either + // double-initialise or corrupt it. Fail loudly here instead of + // trusting docs to keep embedders away from this path. + if teardownSkipped.Load() { + return nil, ErrTeardownSkipped + } + + // Block until every C thread from a prior Init/Shutdown cycle has + // finished unwinding. Abandoned threads (Shutdown timed out without + // force-kill being able to interrupt a blocking syscall) are the + // reason for this: if we reassigned phpThreads / reallocated + // thread_metrics while one was still alive, its pending callbacks + // (ub_write, write_headers, on_thread_shutdown, etc.) would index + // the new generation's structures and corrupt them. + lingeringThreads.Wait() + + // Clear the shutdown flag so the unhealthy-restart guard in C can + // respawn threads again for this Init cycle. + C.frankenphp_set_shutdown_in_progress(false) + mainThread = &phpMainThread{ state: state.NewThreadState(), done: make(chan struct{}), @@ -54,6 +110,11 @@ func initPHPThreads(numThreads int, numMaxThreads int, phpIni map[string]string) return nil, err } + // maxThreads is only final after start() returns, because + // setAutomaticMaxThreads runs on the main PHP thread before it signals + // Ready. That's why this call sits below start() rather than above it. + // Frees the prior generation's thread_metrics if any; first call sees + // a zero-initialised global (free(NULL) is defined as a no-op in C). C.frankenphp_init_thread_metrics(C.int(mainThread.maxThreads)) // initialize all other threads @@ -75,10 +136,36 @@ func initPHPThreads(numThreads int, numMaxThreads int, phpIni map[string]string) return mainThread, nil } +// drainPHPThreads tears down the PHP thread pool without guaranteeing +// every thread has actually exited. Abandoned threads (phpThread.shutdown +// gave up after the force-kill deadline, because the blocked syscall is +// not interruptible) stay alive until their syscall unwinds naturally. +// +// Consequences, both intentional: +// - Force-kill slots are NOT re-released here; each thread clears its +// own slot under the per-thread write lock before ts_free_thread. +// A second release would double-CloseHandle on Windows and bypass +// the lock discipline elsewhere. +// - thread_metrics and phpThreads are NOT freed/nilled. An abandoned +// thread that finally unwinds still calls through the SAPI and +// lifecycle callbacks which index phpThreads[idx] and write into +// thread_metrics[idx]. Leaving them allocated keeps those late +// writes safe; reallocation happens in the next initPHPThreads +// after lingeringThreads.Wait() confirms every prior-generation +// thread has fully exited. func drainPHPThreads() { if mainThread == nil { return // mainThread was never initialized } + // Idempotent: post-drain mainThread.state is Reserved. A second call + // (e.g. from Init's cleanup path after a failed retry) would otherwise + // double-close mainThread.done and panic. + if mainThread.state.Is(state.Reserved) { + return + } + // Tell C we're tearing down: the unhealthy-thread restart path + // checks this flag and refuses to respawn past this point. + C.frankenphp_set_shutdown_in_progress(true) doneWG := sync.WaitGroup{} doneWG.Add(len(phpThreads)) mainThread.state.Set(state.ShuttingDown) @@ -99,8 +186,8 @@ func drainPHPThreads() { doneWG.Wait() mainThread.state.Set(state.Done) mainThread.state.WaitFor(state.Reserved) - C.frankenphp_destroy_thread_metrics() - phpThreads = nil + // thread_metrics / phpThreads are intentionally left allocated + // here; see the function doc. } func (mainThread *phpMainThread) start() error { @@ -170,6 +257,42 @@ func (mainThread *phpMainThread) setAutomaticMaxThreads() { } } +//export go_frankenphp_can_teardown +func go_frankenphp_can_teardown() C.bool { + // Called by php_main right after go_frankenphp_main_thread_is_ready + // returns and before sapi_shutdown/tsrm_shutdown. A drainPHPThreads + // that abandoned threads (force-kill could not interrupt a blocking + // syscall) has set mainThread.state.Done but left the abandoned C + // threads alive; running SAPI/TSRM teardown now would race their + // eventual unwind (ts_free_thread, php_request_shutdown in zend_catch, + // ub_write/write_headers callbacks) against torn-down state. + // + // Bounded wait on lingeringThreads: return true if every C thread has + // exited within the deadline (safe to tear down). On timeout return + // false so the caller skips teardown entirely; process exit will + // reclaim the leaked SAPI/TSRM state. + done := make(chan struct{}) + go func() { + lingeringThreads.Wait() + close(done) + }() + + select { + case <-done: + return C.bool(true) + case <-time.After(mainThreadShutdownDeadline): + // Latch so a subsequent Init fails with ErrTeardownSkipped + // rather than calling sapi_startup on top of never-torn-down + // state. Once set the flag stays set for the process lifetime. + teardownSkipped.Store(true) + if globalLogger.Enabled(globalCtx, slog.LevelWarn) { + globalLogger.LogAttrs(globalCtx, slog.LevelWarn, + "abandoned PHP threads did not exit before main-thread teardown; skipping SAPI/TSRM shutdown to avoid use-after-teardown") + } + return C.bool(false) + } +} + //export go_frankenphp_shutdown_main_thread func go_frankenphp_shutdown_main_thread() { mainThread.state.Set(state.Reserved) diff --git a/phpmainthread_test.go b/phpmainthread_test.go index d274991863..f244505cdd 100644 --- a/phpmainthread_test.go +++ b/phpmainthread_test.go @@ -35,7 +35,10 @@ func TestStartAndStopTheMainThreadWithOneInactiveThread(t *testing.T) { drainPHPThreads() - assert.Nil(t, phpThreads) + // phpThreads is intentionally kept allocated after drain so late + // callbacks from abandoned C threads can still index into it; see + // drainPHPThreads' doc. Verify each thread reached Done instead. + assert.True(t, phpThreads[0].state.Is(state.Done)) } func TestTransitionRegularThreadToWorkerThread(t *testing.T) { @@ -60,7 +63,7 @@ func TestTransitionRegularThreadToWorkerThread(t *testing.T) { assert.Len(t, worker.threads, 0) drainPHPThreads() - assert.Nil(t, phpThreads) + assert.True(t, phpThreads[0].state.Is(state.Done)) } func TestTransitionAThreadBetween2DifferentWorkers(t *testing.T) { @@ -86,7 +89,7 @@ func TestTransitionAThreadBetween2DifferentWorkers(t *testing.T) { assert.Len(t, secondWorker.threads, 1) drainPHPThreads() - assert.Nil(t, phpThreads) + assert.True(t, phpThreads[0].state.Is(state.Done)) } // try all possible handler transitions @@ -180,7 +183,7 @@ func TestFinishBootingAWorkerScript(t *testing.T) { ) drainPHPThreads() - assert.Nil(t, phpThreads) + assert.True(t, phpThreads[0].state.Is(state.Done)) } func TestReturnAnErrorIf2WorkersHaveTheSameFileName(t *testing.T) { diff --git a/phpthread.go b/phpthread.go index a941de9348..86680366c8 100644 --- a/phpthread.go +++ b/phpthread.go @@ -5,9 +5,11 @@ package frankenphp import "C" import ( "context" + "log/slog" "runtime" "sync" "sync/atomic" + "time" "unsafe" "github.com/dunglas/frankenphp/internal/state" @@ -25,6 +27,14 @@ type phpThread struct { contextMu sync.RWMutex state *state.ThreadState requestCount atomic.Int64 + // forceKill is populated by go_frankenphp_store_force_kill_slot from + // the PHP thread's own TSRM context at boot. Read by other goroutines + // via RestartWorkers/DrainWorkers; forceKillMu serialises reads with + // the thread-shutdown clear path so a concurrent force-kill cannot + // dereference the captured &EG() pointers after ts_free_thread() has + // freed the underlying TSRM memory on the PHP thread. + forceKillMu sync.RWMutex + forceKill C.force_kill_slot } // threadHandler defines how the callbacks from the C thread should be handled @@ -86,14 +96,46 @@ func (thread *phpThread) reboot() bool { // shutdown the underlying PHP thread func (thread *phpThread) shutdown() { if !thread.state.RequestSafeStateChange(state.ShuttingDown) { - // already shutting down or done, wait for the C thread to finish - thread.state.WaitFor(state.Done, state.Reserved) + // Already shutting down, done, reserved, or abandoned: wait for + // a terminal state. Abandoned counts as terminal here because an + // abandoned C thread either never unwinds (stuck in an + // uninterruptible syscall) or eventually transitions to Done via + // on_thread_shutdown; Shutdown must not block on it. + thread.state.WaitFor(state.Done, state.Reserved, state.Abandoned) return } close(thread.drainChan) - thread.state.WaitFor(state.Done) + + // Bounded wait: grace period, then force-kill, then abandon. Without + // this, a thread stuck in an uninterruptible blocking syscall (e.g. + // PHP usleep on Windows, any syscall on macOS where force-kill is a + // no-op) would hang Shutdown forever. + done := make(chan struct{}) + go func() { + thread.state.WaitFor(state.Done) + close(done) + }() + select { + case <-done: + case <-time.After(drainGracePeriod): + // Read lock serialises with go_frankenphp_clear_force_kill_slot, + // which zeroes the slot right before ts_free_thread() frees its + // TSRM backing storage. + thread.forceKillMu.RLock() + C.frankenphp_force_kill_thread(thread.forceKill) + thread.forceKillMu.RUnlock() + select { + case <-done: + case <-time.After(forceKillDeadline): + if globalLogger.Enabled(globalCtx, slog.LevelWarn) { + globalLogger.LogAttrs(globalCtx, slog.LevelWarn, + "PHP thread did not exit after force-kill; abandoning to unblock Shutdown") + } + } + } + thread.drainChan = make(chan struct{}) // threads go back to the reserved state from which they can be booted again @@ -203,10 +245,64 @@ func go_frankenphp_after_script_execution(threadIndex C.uintptr_t, exitStatus C. thread.Unpin() } +// lingeringThreads counts native PHP threads that have been spawned and +// not yet completed their teardown callback. initPHPThreads waits on it +// so a re-Init after a Shutdown that abandoned a thread (because +// force-kill could not interrupt a blocking syscall) cannot reassign +// phpThreads / reallocate thread_metrics out from under the abandoned +// thread's pending callbacks. Each C thread does one Add at entry to +// php_thread (via go_frankenphp_thread_spawned) and one Done before +// returning (via go_frankenphp_thread_exited). +var lingeringThreads sync.WaitGroup + +//export go_frankenphp_thread_spawned +func go_frankenphp_thread_spawned() { + lingeringThreads.Add(1) +} + +//export go_frankenphp_thread_exited +func go_frankenphp_thread_exited() { + lingeringThreads.Done() +} + +//export go_frankenphp_store_force_kill_slot +func go_frankenphp_store_force_kill_slot(threadIndex C.uintptr_t, slot C.force_kill_slot) { + thread := phpThreads[threadIndex] + // Take the write lock: an unhealthy-restart respawn races a concurrent + // RestartWorkers/Shutdown that reads the slot under RLock. Without + // this lock, the release+overwrite below could race with a reader + // and/or tear the struct mid-write. + thread.forceKillMu.Lock() + // Release any resource (Windows thread HANDLE) tied to the previous + // slot: a phpThread can reboot (max_requests, unhealthy restart) and + // register a fresh DuplicateHandle each time. + C.frankenphp_release_thread_for_kill(thread.forceKill) + thread.forceKill = slot + thread.forceKillMu.Unlock() +} + +//export go_frankenphp_clear_force_kill_slot +func go_frankenphp_clear_force_kill_slot(threadIndex C.uintptr_t) { + // Called from C right before ts_free_thread() on both the healthy + // and unhealthy thread-exit paths. Clearing the slot here (rather + // than in on_thread_shutdown, which runs after ts_free_thread) means + // the &EG(vm_interrupt) / &EG(timed_out) pointers are swapped out + // for nils BEFORE the TSRM storage they point at is freed, so any + // concurrent force-kill delivery either ran to completion before we + // took the write lock, or sees a zero-valued slot and early-returns. + thread := phpThreads[threadIndex] + thread.forceKillMu.Lock() + C.frankenphp_release_thread_for_kill(thread.forceKill) + thread.forceKill = C.force_kill_slot{} + thread.forceKillMu.Unlock() +} + //export go_frankenphp_on_thread_shutdown func go_frankenphp_on_thread_shutdown(threadIndex C.uintptr_t) { thread := phpThreads[threadIndex] thread.Unpin() + // Force-kill slot is already cleared by go_frankenphp_clear_force_kill_slot + // before ts_free_thread; nothing to do here except the state signal. if thread.state.Is(state.Rebooting) { thread.state.Set(state.RebootReady) } else { diff --git a/scaling.go b/scaling.go index c606925fdf..3d43d7e183 100644 --- a/scaling.go +++ b/scaling.go @@ -213,8 +213,13 @@ func deactivateThreads() { for i := len(autoScaledThreads) - 1; i >= 0; i-- { thread := autoScaledThreads[i] - // the thread might have been stopped otherwise, remove it - if thread.state.Is(state.Reserved) { + // The thread might have been stopped otherwise, remove it. + // Reserved = deactivated through the normal path. Abandoned = + // RestartWorkers gave up on a stuck syscall and marked it so; + // after it finally unwinds it transitions to Done. Without + // dropping those entries here an abandoned auto-scaled thread + // would permanently occupy a global scaling slot. + if thread.state.Is(state.Reserved) || thread.state.Is(state.Done) || thread.state.Is(state.Abandoned) { autoScaledThreads = append(autoScaledThreads[:i], autoScaledThreads[i+1:]...) continue } diff --git a/testdata/worker-sleep.php b/testdata/worker-sleep.php new file mode 100644 index 0000000000..20eeb61bf8 --- /dev/null +++ b/testdata/worker-sleep.php @@ -0,0 +1,21 @@ + 0 { + for _, w := range workers { + w.threadMutex.Lock() + next := w.threads[:0] + for _, t := range w.threads { + if _, ab := abandonedSet[t]; !ab { + next = append(next, t) + } + } + w.threads = next + w.threadMutex.Unlock() + } + + if globalLogger.Enabled(globalCtx, slog.LevelError) { + globalLogger.LogAttrs(globalCtx, slog.LevelError, + errIncompleteRestart.Error(), + slog.Int("abandoned", len(abandoned)), + slog.Int("restarted", len(threadsToRestart)-len(abandoned))) + } + return fmt.Errorf("%w (abandoned=%d, restarted=%d)", + errIncompleteRestart, len(abandoned), len(threadsToRestart)-len(abandoned)) + } + return nil } func (worker *worker) attachThread(thread *phpThread) { diff --git a/worker_test.go b/worker_test.go index 3fd2d63f94..7cda695c2d 100644 --- a/worker_test.go +++ b/worker_test.go @@ -9,13 +9,18 @@ import ( "net/http" "net/http/httptest" "net/url" + "os" + "path/filepath" + "runtime" "strconv" "strings" "sync" "testing" + "time" "github.com/dunglas/frankenphp" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestWorker(t *testing.T) { @@ -45,6 +50,87 @@ func TestWorker(t *testing.T) { }, &testOptions{workerScript: "worker.php", nbWorkers: 1, nbParallelRequests: 1}) } +// TestRestartWorkersForceKillsStuckThread verifies that the drain path used +// by RestartWorkers and DrainWorkers does not hang indefinitely when a +// worker thread is stuck inside a blocking PHP call (sleep, synchronous +// I/O, etc.). The force-kill primitive delivers a realtime signal to the +// thread on Linux/FreeBSD (interrupts the syscall with EINTR) or calls +// CancelSynchronousIo + QueueUserAPC on Windows. macOS has no realtime +// signal exposed to user-space, so a thread stuck in sleep() cannot be +// force-unblocked there; skip the test. +func TestRestartWorkersForceKillsStuckThread(t *testing.T) { + if runtime.GOOS != "linux" && runtime.GOOS != "freebsd" && runtime.GOOS != "windows" { + t.Skipf("force-kill cannot interrupt blocking syscalls on %s", runtime.GOOS) + } + + cwd, _ := os.Getwd() + testDataDir := cwd + "/testdata/" + + require.NoError(t, frankenphp.Init( + frankenphp.WithWorkers("sleep-worker", testDataDir+"worker-sleep.php", 1), + frankenphp.WithNumThreads(2), + )) + t.Cleanup(frankenphp.Shutdown) + + // Marker the worker touches right before sleep(). Using a unique path + // per test run avoids any leftover file fooling the poll below. + markerFile := filepath.Join(t.TempDir(), "sleep-worker-in-sleep") + + // Fire a request the worker will handle and then block on (sleep 60s). + // When the drain runs, the worker script is inside the handler callback, + // below frankenphp_handle_request, so the drain signal on drainChan + // can't be observed until the blocking sleep returns. We keep the + // recorder so we can assert the echo after sleep() is never reached, + // which would indicate force-kill failed to interrupt. + recorder := httptest.NewRecorder() + served := make(chan struct{}) + go func() { + defer close(served) + req := httptest.NewRequest("GET", "http://example.com/worker-sleep.php", nil) + req.Header.Set("Sleep-Marker", markerFile) + fr, err := frankenphp.NewRequestWithContext(req, frankenphp.WithRequestDocumentRoot(testDataDir, false)) + if err != nil { + return + } + _ = frankenphp.ServeHTTP(recorder, fr) + }() + + // Wait for the worker to actually enter sleep() - otherwise the drain + // might go through the normal drainChan path and never exercise the + // force-kill code we're trying to test. + require.Eventually(t, func() bool { + _, err := os.Stat(markerFile) + return err == nil + }, 5*time.Second, 10*time.Millisecond, "worker never entered sleep()") + + // RestartWorkers must complete within the grace period + a bit of slack. + // Without force-kill, it would wait for the 60s sleep to return. + start := time.Now() + err := frankenphp.RestartWorkers() + elapsed := time.Since(start) + + // Grace period is 5s; allow margin for SIGRTMIN+3 dispatch, PHP VM + // tick, and the drain's final ready.Wait() plus the restart loop. + const budget = 8 * time.Second + assert.Less(t, elapsed, budget, "drain must force-kill the stuck thread within the grace period") + // Force-kill should wake sleep() via SIGRTMIN+3 on Linux/FreeBSD and + // via CancelSynchronousIo on Windows alertable waits, so the restart + // must report no abandonment. + assert.NoError(t, err, "force-kill should wake the stuck thread and let it restart fully") + + // Wait for the served goroutine to finish, then assert that the + // "should not reach" line after sleep() never ran - if it did, the + // force-kill didn't actually interrupt the PHP VM, only the drain + // signal did. + select { + case <-served: + case <-time.After(2 * time.Second): + t.Fatal("server request goroutine did not complete after drain") + } + assert.NotContains(t, recorder.Body.String(), "should not reach", + "force-kill should have unwound before the echo after sleep(); if this fires, the VM interrupt was never observed") +} + func TestWorkerDie(t *testing.T) { runTest(t, func(handler func(http.ResponseWriter, *http.Request), _ *httptest.Server, i int) { req := httptest.NewRequest("GET", "http://example.com/die.php", nil)