diff --git a/src/daemon/admin/index.ts b/src/daemon/admin/index.ts
index db8733b..5adafe8 100644
--- a/src/daemon/admin/index.ts
+++ b/src/daemon/admin/index.ts
@@ -1,5 +1,5 @@
 import "websocket-polyfill";
-import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser } from '@nostr-dev-kit/ndk';
+import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser, NostrEvent } from '@nostr-dev-kit/ndk';
 import { NDKNostrRpc } from '@nostr-dev-kit/ndk';
 import createDebug from 'debug';
 import { Key, KeyUser } from '../run';
@@ -168,17 +168,12 @@ class AdminInterface {
                 this.handleRequest(req);
             });
 
-            // Connection watchdog: exit if pool reports no connected relays
-            // for >60s so the process supervisor (systemd / docker restart
-            // policy / k8s) can recover. Replaces the original self-echo
-            // pingOrDie — see relayConnectionWatchdog comment + #4 + #7.
-            // Operators with external liveness checking can disable via
-            // NSEC_BUNKER_DISABLE_WATCHDOG=1.
-            if (process.env.NSEC_BUNKER_DISABLE_WATCHDOG !== '1') {
-                relayConnectionWatchdog(this.ndk);
-            } else {
-                console.log('⏸ watchdog disabled via NSEC_BUNKER_DISABLE_WATCHDOG=1');
-            }
+            // pingOrDie disabled — NDK 2.8.1 outbox model doesn't echo
+            // self-published events back through subscriptions on
+            // non-public relay channels, so the watchdog fires false
+            // positives and exits the bunker every 50s on private relays.
+            // See aiolabs/nsecbunkerd#4 + #7.
+            // pingOrDie(this.ndk);
         }).catch((err) => {
             console.log('❌ admin connection failed');
             console.log(err);
@@ -459,47 +454,57 @@ class AdminInterface {
     }
 }
 
-/**
- * Pool-status connection watchdog. Exits the daemon if every relay in
- * the pool stays disconnected for longer than PARTITION_THRESHOLD_MS.
- *
- * Replaces the original `pingOrDie` self-echo watchdog, which published
- * a kind-24133 event to its own pubkey every 20s and exited if it
- * didn't see the echo within 50s. That works on public relays but
- * silently breaks on single-private-relay setups: NDK 2.8.1's outbox
- * model doesn't reliably route self-publishes back through the
- * matching subscription, so the watchdog fires false positives and
- * exits the daemon every 50s while RPCs over the same channel still
- * work fine. See aiolabs/nsecbunkerd#4 + #7.
- *
- * The pool-status approach uses NDK's own connection-lifecycle
- * tracking — `pool.connectedRelays()` reports relays in
- * NDKRelayStatus.CONNECTED — which is reliable across all relay
- * configurations because it doesn't depend on round-trip
- * publish/subscribe. No event is published; no relay traffic.
- *
- * Detects partition within POLL_INTERVAL + PARTITION_THRESHOLD ms.
- * Transient disconnects shorter than PARTITION_THRESHOLD don't trip
- * the watchdog — useful for relays that flap or briefly drop on
- * network blips.
- */
-async function relayConnectionWatchdog(ndk: NDK) {
-    const POLL_INTERVAL_MS = 10_000;
-    const PARTITION_THRESHOLD_MS = 60_000;
-    let lastConnectedAt = Date.now();
+/**
+ * Self-echo liveness watchdog. Every 20s it publishes a NostrConnect
+ * "ping" event p-tagged to the signer's own pubkey, and it subscribes to
+ * that same author/kind/#p filter. If no matching event is received for
+ * 50s, or if a publish fails, the process exits with code 1.
+ *
+ * Currently not invoked: the call site is commented out because the echo
+ * is not delivered on private relays (aiolabs/nsecbunkerd#4 + #7).
+ *
+ * @param ndk NDK instance; `ndk.signer` is asserted non-null here.
+ */
+async function pingOrDie(ndk: NDK) {
+    const PING_INTERVAL_MS = 20_000;
+    const PING_TIMEOUT_MS = 50_000;
+    let deathTimer: NodeJS.Timeout | null = null;
+
+    function resetDeath() {
+        if (deathTimer) clearTimeout(deathTimer);
+        deathTimer = setTimeout(() => {
+            console.log(`❌ No ping event received in ${PING_TIMEOUT_MS / 1000} seconds. Exiting.`);
+            process.exit(1);
+        }, PING_TIMEOUT_MS);
+    }
+
+    const self = await ndk.signer!.user();
+    const sub = ndk.subscribe({
+        authors: [self.pubkey],
+        kinds: [NDKKind.NostrConnect],
+        "#p": [self.pubkey]
+    });
+    sub.on("event", (event: NDKEvent) => {
+        console.log(`🔔 Received ping event:`, event.created_at);
+        resetDeath();
+    });
+    sub.start();
+
+    resetDeath();
 
     setInterval(() => {
-        const connectedCount = ndk.pool.connectedRelays().length;
-        if (connectedCount > 0) {
-            lastConnectedAt = Date.now();
-            return;
-        }
-        const elapsed = Date.now() - lastConnectedAt;
-        if (elapsed > PARTITION_THRESHOLD_MS) {
-            console.log(`❌ No connected relays for ${Math.floor(elapsed / 1000)}s. Exiting.`);
+        const event = new NDKEvent(ndk, {
+            kind: NDKKind.NostrConnect,
+            tags: [ ["p", self.pubkey] ],
+            content: "ping"
+        } as NostrEvent);
+        event.publish().then(() => {
+            console.log(`🔔 Sent ping event:`, event.created_at);
+        }).catch((e: unknown) => {
+            console.log(`❌ Failed to send ping event:`, e instanceof Error ? e.message : e);
             process.exit(1);
-        }
-    }, POLL_INTERVAL_MS);
+        });
+    }, PING_INTERVAL_MS);
 }
 
 export default AdminInterface;