From 1792bc489c5624e305aa546c40aeda81bc6994c8 Mon Sep 17 00:00:00 2001 From: Padreug Date: Wed, 27 May 2026 20:42:43 +0200 Subject: [PATCH] fix(#4): replace pingOrDie self-echo watchdog with pool-status check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original watchdog published a kind-24133 event to its own pubkey every 20s and exited if no echo arrived within 50s. On a single-private-relay setup (LNbits's nostrrelay extension channel), NDK 2.8.1's outbox model doesn't reliably route self-publishes back through the matching subscription, so the watchdog fires false positives and exits every 50s even though admin RPCs over the same channel still work fine. The upstream patches we landed previously (commit 42dbbd7) commented the call out as an emergency stopgap; this commit replaces the mechanism with one that actually answers the right question. Pool-status watchdog: poll `ndk.pool.connectedRelays().length` every 10s, track the most recent moment any relay was connected, exit if no relay has been connected for 60s. Uses NDK's own connection-lifecycle tracking, which works reliably across all relay configurations — no self-publish, no subscription dependency, no relay traffic. Same intent as pingOrDie (detect a partition from the relay layer and let the supervisor restart us), but with a reliable signal. Call site re-enable + env-flag opt-out follow in the next commit. Drops the now-unused NostrEvent import. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/daemon/admin/index.ts | 75 ++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/daemon/admin/index.ts b/src/daemon/admin/index.ts index 5adafe8..3940327 100644 --- a/src/daemon/admin/index.ts +++ b/src/daemon/admin/index.ts @@ -1,5 +1,5 @@ import "websocket-polyfill"; -import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser, NostrEvent } from '@nostr-dev-kit/ndk'; +import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser } from '@nostr-dev-kit/ndk'; import { NDKNostrRpc } from '@nostr-dev-kit/ndk'; import createDebug from 'debug'; import { Key, KeyUser } from '../run'; @@ -454,44 +454,47 @@ class AdminInterface { } } -async function pingOrDie(ndk: NDK) { - let deathTimer: NodeJS.Timeout | null = null; - - function resetDeath() { - if (deathTimer) clearTimeout(deathTimer); - deathTimer = setTimeout(() => { - console.log(`❌ No ping event received in 30 seconds. Exiting.`); - process.exit(1); - }, 50000); - } - - const self = await ndk.signer!.user(); - const sub = ndk.subscribe({ - authors: [self.pubkey], - kinds: [NDKKind.NostrConnect], - "#p": [self.pubkey] - }); - sub.on("event", (event: NDKEvent) => { - console.log(`🔔 Received ping event:`, event.created_at); - resetDeath(); - }); - sub.start(); - - resetDeath(); +/** + * Pool-status connection watchdog. Exits the daemon if every relay in + * the pool stays disconnected for longer than PARTITION_THRESHOLD_MS. + * + * Replaces the original `pingOrDie` self-echo watchdog, which published + * a kind-24133 event to its own pubkey every 20s and exited if it + * didn't see the echo within 50s. 
That works on public relays but + * silently breaks on single-private-relay setups: NDK 2.8.1's outbox + * model doesn't reliably route self-publishes back through the + * matching subscription, so the watchdog fires false positives and + * exits the daemon every 50s while RPCs over the same channel still + * work fine. See aiolabs/nsecbunkerd#4 + #7. + * + * The pool-status approach uses NDK's own connection-lifecycle + * tracking — `pool.connectedRelays()` reports relays in + * NDKRelayStatus.CONNECTED — which is reliable across all relay + * configurations because it doesn't depend on round-trip + * publish/subscribe. No event is published; no relay traffic. + * + * Detects partition within POLL_INTERVAL + PARTITION_THRESHOLD ms. + * Transient disconnects shorter than PARTITION_THRESHOLD don't trip + * the watchdog — useful for relays that flap or briefly drop on + * network blips. + */ +async function relayConnectionWatchdog(ndk: NDK) { + const POLL_INTERVAL_MS = 10_000; + const PARTITION_THRESHOLD_MS = 60_000; + let lastConnectedAt = Date.now(); setInterval(() => { - const event = new NDKEvent(ndk, { - kind: NDKKind.NostrConnect, - tags: [ ["p", self.pubkey] ], - content: "ping" - } as NostrEvent); - event.publish().then(() => { - console.log(`🔔 Sent ping event:`, event.created_at); - }).catch((e: any) => { - console.log(`❌ Failed to send ping event:`, e.message); + const connectedCount = ndk.pool.connectedRelays().length; + if (connectedCount > 0) { + lastConnectedAt = Date.now(); + return; + } + const elapsed = Date.now() - lastConnectedAt; + if (elapsed > PARTITION_THRESHOLD_MS) { + console.log(`❌ No connected relays for ${Math.floor(elapsed / 1000)}s. Exiting.`); process.exit(1); - }); - }, 20000); + } + }, POLL_INTERVAL_MS); } export default AdminInterface;