From a690596b857301ed34f0f13ec2b263bdbe2c7436 Mon Sep 17 00:00:00 2001 From: Padreug Date: Wed, 3 Jun 2026 18:55:55 +0200 Subject: [PATCH] fix(daemon): keep retrying relay reconnect indefinitely, overriding NDK give-up (#20) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NDK 3.x's per-relay connectivity machine gives up after ~3 fast-fail (ECONNREFUSED) cycles. Three sub-second failures look identical, so `isFlapping()` (std-dev < 1s) returns true and the relay transitions to FLAPPING; NDKPool's `handleFlapping` then reschedules with doubling backoff (5s → 10s → 20s → 40s → 80s …). For nsecbunkerd, "disconnected for 80+s after every lnbits restart" is the failure mode users hit on the regtest dev stack: bunker container boots before lnbits's nostrrelay extension is accepting WebSockets → ECONNREFUSED storm → NDK flagged FLAPPING → bunker stays silently deaf until manual restart. Symptom is particularly hostile because: - `relay:connect` fires optimistically; the immediate ECONNREFUSED follow-up doesn't propagate to user-facing logs. - `NSEC_BUNKER_DISABLE_WATCHDOG=1` (the dev-stack default) skips the exit-and-restart safety net. - Manual `docker compose restart nsecbunker` is the only recovery. Fix: attach a small supervisor (`attachIndefiniteReconnect`) to both NDK instances (daemon's backend NDK in run.ts, AdminInterface's admin NDK in admin/index.ts). On `relay:disconnect` or `flapping`, schedule a manual `relay.connect()` with a SHORT capped delay (1s → 2s → 4s → 8s → 10s, capped at 10s instead of NDK's unbounded doubling). Successful connect resets the attempt counter so a future disconnect storm starts fresh. Coexists cleanly with the relay-connection watchdog (admin/index.ts:500): - Brief disconnects (e.g. lnbits restart): supervisor recovers within seconds, watchdog never fires. 
- Persistent disconnects (relay truly down): supervisor keeps trying every ≤10s; if it can't recover within 60s, watchdog still exits and the process supervisor restarts the bunker. So the watchdog becomes a long-tail safety net; this supervisor handles the common case. Operators with `NSEC_BUNKER_DISABLE_WATCHDOG=1` set as a workaround for this bug can re-enable the watchdog once this lands. Trade-off: we may hammer a permanently-down relay every 10s. Acceptable because the bunker's primary relay is typically on the same host or LAN (loopback or docker-internal); TCP RSTs are cheap. Public-relay setups can layer external supervision on top. Verified on regtest dev stack (cold-boot race): bunker logs 🔁 admin: scheduling reconnect to ws://lnbits:5001/nostrrelay/test/ in 1000ms (attempt 1, overriding NDK give-up) 🔁 backend: scheduling reconnect to ws://lnbits:5001/nostrrelay/test/ in 1000ms (attempt 1, overriding NDK give-up) on each disconnect, where pre-fix the bunker stayed silently deaf. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/daemon/admin/index.ts | 10 +++ src/daemon/lib/relay-reconnect.ts | 101 ++++++++++++++++++++++++++++++ src/daemon/run.ts | 7 +++ 3 files changed, 118 insertions(+) create mode 100644 src/daemon/lib/relay-reconnect.ts diff --git a/src/daemon/admin/index.ts b/src/daemon/admin/index.ts index e4d8509..a918a66 100644 --- a/src/daemon/admin/index.ts +++ b/src/daemon/admin/index.ts @@ -25,6 +25,7 @@ import { validateRequestFromAdmin } from './validations/request-from-admin'; import { dmUser } from '../../utils/dm-user'; import { IConfig, getCurrentConfig } from "../../config"; import path from 'path'; +import { attachIndefiniteReconnect } from '../lib/relay-reconnect.js'; const debug = createDebug("nsecbunker:admin"); @@ -62,6 +63,15 @@ class AdminInterface { explicitRelayUrls: opts.adminRelays, signer: new NDKPrivateKeySigner(opts.key), }); + + // Override NDK's "give up after detecting flapping" behavior so the + // bunker's admin NDK keeps trying to reconnect indefinitely. The + // watchdog (when enabled) still fires after 60s of zero connected + // relays; this helper handles shorter disconnects (e.g. an lnbits + // restart that pulls the nostrrelay extension's WS for a few + // seconds) without involving the supervisor. See aiolabs/nsecbunkerd#20. + attachIndefiniteReconnect(this.ndk, 'admin'); + this.ndk.signer?.user().then((user: NDKUser) => { let connectionString = `bunker://${user.npub}`; diff --git a/src/daemon/lib/relay-reconnect.ts b/src/daemon/lib/relay-reconnect.ts new file mode 100644 index 0000000..e3fdb4f --- /dev/null +++ b/src/daemon/lib/relay-reconnect.ts @@ -0,0 +1,101 @@ +import NDK from "@nostr-dev-kit/ndk"; + +/** + * Attaches an aggressive-reconnect supervisor to an NDK instance. + * + * NDK 3.x's per-relay connectivity state machine gives up retrying after + * a few consecutive fast-fail (e.g. ECONNREFUSED returns in <1 ms) + * connection attempts: + * + * 1. 
Each attempt's duration is recorded in `_connectionStats.durations`. + * 2. After every 3 attempts, `isFlapping()` checks the std-dev of those + * durations against `FLAPPING_THRESHOLD_MS` (1 second). Three fast + * failures look identical → tiny std-dev → flapping=true → status + * transitions to FLAPPING and the per-relay retry stops. + * 3. `NDKPool.handleFlapping` catches the event and reschedules a + * reconnect via doubling backoff (5s → 10s → 20s → 40s → 80s …), + * growing unbounded. + * + * For nsecbunkerd, where the admin relay is typically a single relay we + * **must** stay subscribed to, "disconnected for 80+s after every dev + * restart" is the failure mode users hit (aiolabs/nsecbunkerd#20). The + * pool's doubling backoff is too pessimistic for our use case. + * + * This helper sidesteps the give-up path: when the pool emits `flapping` + * (the symptom that NDK has internally given up), or when we see the + * relay disconnect outside our own request, we manually call + * `relay.connect()` with a SHORT capped delay. Successful connect resets + * the attempt counter so a future disconnect storm doesn't grow the + * delay. + * + * Trade-off: we may hammer a permanently-down relay every 10s. That's + * fine for a bunker — being disconnected silently is strictly worse than + * a retry storm against localhost. Acceptable because: + * - The bunker's primary relay is typically on the same host or LAN + * (`ws://lnbits:5001/...`); TCP RSTs are cheap. + * - Public-relay setups can layer external supervision on top if they + * care about retry pressure. 
+ */
+export function attachIndefiniteReconnect(ndk: NDK, label: string): void {
+    const RECONNECT_BASE_MS = 1_000;
+    const RECONNECT_CAP_MS = 10_000;
+
+    // Per-relay-URL state: manual attempts made, and the single queued timer.
+    const attempts = new Map<string, number>();
+    const pending = new Map<string, ReturnType<typeof setTimeout>>();
+
+    const reconnectDelay = (n: number): number =>
+        Math.min(RECONNECT_BASE_MS * 2 ** n, RECONNECT_CAP_MS);
+
+    const scheduleReconnect = (relay: any): void => {
+        const url: string = relay.url;
+        if (pending.has(url)) return; // at most one queued retry per relay
+        const n = attempts.get(url) ?? 0;
+        const delay = reconnectDelay(n);
+        console.log(
+            `🔁 ${label}: scheduling reconnect to ${url} in ${delay}ms ` +
+                `(attempt ${n + 1}, overriding NDK give-up)`
+        );
+        const timer = setTimeout(() => {
+            pending.delete(url);
+            attempts.set(url, n + 1);
+            // Promise chain also traps a synchronous throw from connect().
+            Promise.resolve()
+                .then(() => relay.connect())
+                .catch((e: any) => {
+                    console.log(
+                        `❌ ${label}: manual reconnect to ${url} failed: ` +
+                            `${e?.message ?? e}`
+                    );
+                    // Re-arm here so retries never depend on NDK emitting a
+                    // follow-up event; the `pending` guard dedupes if it does.
+                    scheduleReconnect(relay);
+                });
+        }, delay);
+        pending.set(url, timer);
+    };
+
+    ndk.pool.on("flapping", (relay: any) => {
+        console.log(
+            `⚠️ ${label}: NDK flagged ${relay.url} as flapping ` +
+                `(connectivity machine gave up internally)`
+        );
+        scheduleReconnect(relay);
+    });
+
+    ndk.pool.on("relay:disconnect", (relay: any) => scheduleReconnect(relay));
+
+    ndk.pool.on("relay:connect", (relay: any) => {
+        const url: string = relay.url;
+        const n = attempts.get(url) ?? 0;
+        if (n > 0) {
+            console.log(
+                `✅ ${label}: recovered ${url} after ${n} manual reconnect attempt(s)`
+            );
+        }
+        attempts.delete(url);
+        // Relay is back: drop any queued retry so it cannot fire needlessly.
+        clearTimeout(pending.get(url));
+        pending.delete(url);
+    });
+}
diff --git a/src/daemon/run.ts b/src/daemon/run.ts
index 743606d..7eaa512 100644
--- a/src/daemon/run.ts
+++ b/src/daemon/run.ts
@@ -15,6 +15,7 @@ import FastifyView from '@fastify/view';
 import Handlebars from "handlebars";
 import {authorizeRequestWebHandler, processRequestWebHandler} from "./web/authorize.js";
 import {processRegistrationWebHandler} from "./web/authorize.js";
+import { attachIndefiniteReconnect } from "./lib/relay-reconnect.js";
 
 export type Key = {
     name: string;
@@ -165,6 +166,12 @@ class Daemon {
         this.ndk.pool.on('relay:disconnect', (r) => {
             console.log(`🚫 Disconnected from ${r.url}`);
         });
+
+        // Override NDK's "give up after detecting flapping" behavior so the
+        // bunker's backend NDK keeps trying to reconnect indefinitely.
+        // Without this, an ECONNREFUSED storm at boot (relay not yet up)
+        // permanently strands the bunker. See aiolabs/nsecbunkerd#20.
+        attachIndefiniteReconnect(this.ndk, 'backend');
     }
 
     async startWebAuth() {
-- 
2.53.0