Merge pull request 'fix(daemon): keep retrying relay reconnect indefinitely, overriding NDK give-up (#20)' (#22) from fix-20-indefinite-relay-reconnect into dev

Reviewed-on: #22
2026-06-03 16:58:48 +00:00 · 2026-06-03 16:58:48 +00:00 · dad42a7669
commit dad42a7669
parent 131f689c6f a690596b85
3 changed files with 118 additions and 0 deletions
--- a/src/daemon/admin/index.ts
+++ b/src/daemon/admin/index.ts
@ -25,6 +25,7 @@ import { validateRequestFromAdmin } from './validations/request-from-admin';
 import { dmUser } from '../../utils/dm-user';
 import { IConfig, getCurrentConfig } from "../../config";
 import path from 'path';
+import { attachIndefiniteReconnect } from '../lib/relay-reconnect.js';


 const debug = createDebug("nsecbunker:admin");
@ -62,6 +63,15 @@ class AdminInterface {
            explicitRelayUrls: opts.adminRelays,
            signer: new NDKPrivateKeySigner(opts.key),
        });
+
+        // Override NDK's "give up after detecting flapping" behavior so the
+        // bunker's admin NDK keeps trying to reconnect indefinitely. The
+        // watchdog (when enabled) still fires after 60s of zero connected
+        // relays; this helper handles shorter disconnects (e.g. an lnbits
+        // restart that pulls the nostrrelay extension's WS for a few
+        // seconds) without involving the supervisor. See aiolabs/nsecbunkerd#20.
+        attachIndefiniteReconnect(this.ndk, 'admin');
+
        this.ndk.signer?.user().then((user: NDKUser) => {
            let connectionString = `bunker://${user.npub}`;

--- a/src/daemon/lib/relay-reconnect.ts
+++ b/src/daemon/lib/relay-reconnect.ts
@ -0,0 +1,101 @@
+import NDK from "@nostr-dev-kit/ndk";
+
+/**
+ * Attaches an aggressive-reconnect supervisor to an NDK instance.
+ *
+ * NDK 3.x's per-relay connectivity state machine gives up retrying after
+ * a few consecutive fast-fail (e.g. ECONNREFUSED returns in <1 ms)
+ * connection attempts:
+ *
+ *   1. Each attempt's duration is recorded in `_connectionStats.durations`.
+ *   2. After every 3 attempts, `isFlapping()` checks the std-dev of those
+ *      durations against `FLAPPING_THRESHOLD_MS` (1 second). Three fast
+ *      failures look identical → tiny std-dev → flapping=true → status
+ *      transitions to FLAPPING and the per-relay retry stops.
+ *   3. `NDKPool.handleFlapping` catches the event and reschedules a
+ *      reconnect via doubling backoff (5s → 10s → 20s → 40s → 80s …),
+ *      growing unbounded.
+ *
+ * For nsecbunkerd, where the admin relay is typically a single relay we
+ * **must** stay subscribed to, "disconnected for 80+s after every dev
+ * restart" is the failure mode users hit (aiolabs/nsecbunkerd#20). The
+ * pool's doubling backoff is too pessimistic for our use case.
+ *
+ * This helper sidesteps the give-up path: when the pool emits `flapping`
+ * (the symptom that NDK has internally given up), or on any
+ * `relay:disconnect` (the code does not check who initiated it), we call
+ * `relay.connect()` after a short delay: 1s, doubling per attempt, capped
+ * at 10s. A successful connect resets the attempt counter so a future
+ * disconnect storm starts again from the shortest delay.
+ *
+ * Trade-off: we may hammer a permanently-down relay every 10s. That's
+ * fine for a bunker — being disconnected silently is strictly worse than
+ * a retry storm against localhost. Acceptable because:
+ *   - The bunker's primary relay is typically on the same host or LAN
+ *     (`ws://lnbits:5001/...`); TCP RSTs are cheap.
+ *   - Public-relay setups can layer external supervision on top if they
+ *     care about retry pressure.
+ */
+export function attachIndefiniteReconnect(ndk: NDK, label: string): void {
+    const RECONNECT_BASE_MS = 1_000; // delay before the first manual attempt
+    const RECONNECT_CAP_MS = 10_000; // upper bound on the backoff delay
+
+    const attempts = new Map<string, number>(); // per-URL manual attempts since the last 'relay:connect'
+    const pending = new Map<string, NodeJS.Timeout>(); // per-URL timer for a scheduled, not-yet-fired reconnect
+
+    const reconnectDelay = (n: number): number =>
+        Math.min(RECONNECT_BASE_MS * 2 ** n, RECONNECT_CAP_MS);
+
+    const scheduleReconnect = (relay: any): void => {
+        const url: string = relay.url;
+        if (pending.has(url)) return; // at most one scheduled reconnect per URL
+        const n = attempts.get(url) ?? 0;
+        const delay = reconnectDelay(n);
+        console.log(
+            `🔁 ${label}: scheduling reconnect to ${url} in ${delay}ms ` +
+            `(attempt ${n + 1}, overriding NDK give-up)`
+        );
+        const timer = setTimeout(() => {
+            pending.delete(url); // timer has fired; a new reconnect may be scheduled
+            attempts.set(url, n + 1); // counted when the attempt starts, before its outcome is known
+            relay.connect().catch((e: any) => {
+                console.log(
+                    `❌ ${label}: manual reconnect to ${url} failed: ` +
+                    `${e?.message ?? e}`
+                );
+                // Don't recurse here — this relies on NDK emitting another
+                // 'flapping' or 'relay:disconnect' event; TODO confirm it does.
+            });
+        }, delay);
+        pending.set(url, timer);
+    };
+
+    ndk.pool.on("flapping", (relay: any) => {
+        console.log(
+            `⚠️  ${label}: NDK flagged ${relay.url} as flapping ` +
+            `(connectivity machine gave up internally)`
+        );
+        scheduleReconnect(relay);
+    });
+
+    ndk.pool.on("relay:disconnect", (relay: any) => {
+        scheduleReconnect(relay); // every disconnect schedules a reconnect; the initiator is not checked
+    });
+
+    ndk.pool.on("relay:connect", (relay: any) => {
+        const url: string = relay.url;
+        const n = attempts.get(url) ?? 0;
+        if (n > 0) {
+            console.log(
+                `✅ ${label}: recovered ${url} after ${n} manual reconnect ` +
+                `attempt(s)`
+            );
+        }
+        attempts.delete(url); // the next outage starts again from the base delay
+        const timer = pending.get(url);
+        if (timer) {
+            clearTimeout(timer); // already connected; cancel the scheduled reconnect
+            pending.delete(url);
+        }
+    });
+}
--- a/src/daemon/run.ts
+++ b/src/daemon/run.ts
@ -15,6 +15,7 @@ import FastifyView from '@fastify/view';
 import Handlebars from "handlebars";
 import {authorizeRequestWebHandler, processRequestWebHandler} from "./web/authorize.js";
 import {processRegistrationWebHandler} from "./web/authorize.js";
+import { attachIndefiniteReconnect } from "./lib/relay-reconnect.js";

 export type Key = {
    name: string;
@ -165,6 +166,12 @@ class Daemon {
        this.ndk.pool.on('relay:disconnect', (r) => {
            console.log(`🚫 Disconnected from ${r.url}`);
        });
+
+        // Override NDK's "give up after detecting flapping" behavior so the
+        // bunker's backend NDK keeps trying to reconnect indefinitely.
+        // Without this, an ECONNREFUSED storm at boot (relay not yet up)
+        // permanently strands the bunker. See aiolabs/nsecbunkerd#20.
+        attachIndefiniteReconnect(this.ndk, 'backend');
    }

    async startWebAuth() {