diff --git a/src/daemon/admin/index.ts b/src/daemon/admin/index.ts index 5adafe8..3940327 100644 --- a/src/daemon/admin/index.ts +++ b/src/daemon/admin/index.ts @@ -1,5 +1,5 @@ import "websocket-polyfill"; -import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser, NostrEvent } from '@nostr-dev-kit/ndk'; +import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser } from '@nostr-dev-kit/ndk'; import { NDKNostrRpc } from '@nostr-dev-kit/ndk'; import createDebug from 'debug'; import { Key, KeyUser } from '../run'; @@ -454,44 +454,47 @@ class AdminInterface { } } -async function pingOrDie(ndk: NDK) { - let deathTimer: NodeJS.Timeout | null = null; - - function resetDeath() { - if (deathTimer) clearTimeout(deathTimer); - deathTimer = setTimeout(() => { - console.log(`❌ No ping event received in 30 seconds. Exiting.`); - process.exit(1); - }, 50000); - } - - const self = await ndk.signer!.user(); - const sub = ndk.subscribe({ - authors: [self.pubkey], - kinds: [NDKKind.NostrConnect], - "#p": [self.pubkey] - }); - sub.on("event", (event: NDKEvent) => { - console.log(`🔔 Received ping event:`, event.created_at); - resetDeath(); - }); - sub.start(); - - resetDeath(); +/** + * Pool-status connection watchdog. Exits the daemon if every relay in + * the pool stays disconnected for longer than PARTITION_THRESHOLD_MS. + * + * Replaces the original `pingOrDie` self-echo watchdog, which published + * a kind-24133 event to its own pubkey every 20s and exited if it + * didn't see the echo within 50s. That works on public relays but + * silently breaks on single-private-relay setups: NDK 2.8.1's outbox + * model doesn't reliably route self-publishes back through the + * matching subscription, so the watchdog fires false positives and + * exits the daemon every 50s while RPCs over the same channel still + * work fine. See aiolabs/nsecbunkerd#4 + #7. 
+ * + * The pool-status approach uses NDK's own connection-lifecycle + * tracking — `pool.connectedRelays()` reports relays in + * NDKRelayStatus.CONNECTED — which is reliable across all relay + * configurations because it doesn't depend on round-trip + * publish/subscribe. No event is published; no relay traffic. + * + * Detects partition within POLL_INTERVAL_MS + PARTITION_THRESHOLD_MS. + * Transient disconnects shorter than PARTITION_THRESHOLD_MS don't trip + * the watchdog — useful for relays that flap or briefly drop on + * network blips. + */ +async function relayConnectionWatchdog(ndk: NDK) { + const POLL_INTERVAL_MS = 10_000; + const PARTITION_THRESHOLD_MS = 60_000; + let lastConnectedAt = Date.now(); setInterval(() => { - const event = new NDKEvent(ndk, { - kind: NDKKind.NostrConnect, - tags: [ ["p", self.pubkey] ], - content: "ping" - } as NostrEvent); - event.publish().then(() => { - console.log(`🔔 Sent ping event:`, event.created_at); - }).catch((e: any) => { - console.log(`❌ Failed to send ping event:`, e.message); + const connectedCount = ndk.pool.connectedRelays().length; + if (connectedCount > 0) { + lastConnectedAt = Date.now(); + return; + } + const elapsed = Date.now() - lastConnectedAt; + if (elapsed > PARTITION_THRESHOLD_MS) { + console.log(`❌ No connected relays for ${Math.floor(elapsed / 1000)}s. Exiting.`); process.exit(1); - }); - }, 20000); + } + }, POLL_INTERVAL_MS); } export default AdminInterface;