Compare commits

...

2 commits

Author SHA1 Message Date
fb1c239e15 fix(#4): re-enable connection watchdog with env-flag opt-out
Some checks failed
Docker image / build-and-push-image (push) Has been cancelled
Calls `relayConnectionWatchdog` (introduced in the previous commit) at
the end of admin-interface connect(). The watchdog is on by default; set
NSEC_BUNKER_DISABLE_WATCHDOG=1 to turn it off, for operators who run
external liveness checks (Prometheus probes, k8s readiness, etc.) and
don't want the daemon to self-terminate.

This restores the watchdog behavior that was commented out in commit
42dbbd7 (the emergency stopgap for the old self-echo false positives),
but on top of the now-reliable pool-status mechanism.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 20:43:12 +02:00
1792bc489c fix(#4): replace pingOrDie self-echo watchdog with pool-status check
The original watchdog published a kind-24133 event to its own pubkey
every 20s and exited if no echo arrived within 50s. On a single private
relay setup (LNbits's nostrrelay extension channel), NDK 2.8.1's outbox
model doesn't reliably route self-publishes back through the matching
subscription, so the watchdog fires false positives and exits every 50s
even though admin RPCs over the same channel still work fine. The
upstream patches we landed previously (commit 42dbbd7) commented the
call out as an emergency stopgap; this commit replaces the mechanism
with one that actually answers the right question.

Pool-status watchdog: poll `ndk.pool.connectedRelays().length` every
10s, track the most recent moment any relay was connected, exit if no
relay has been connected for 60s. Uses NDK's own connection-lifecycle
tracking which works reliably across all relay configurations — no
self-publish, no subscription dependency, no relay traffic. Same intent
as pingOrDie (detect a partition from the relay layer and let the
supervisor restart us), but with a reliable signal.

Call site re-enable + env-flag opt-out follow in the next commit.

Drops the now-unused NostrEvent import.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 20:42:43 +02:00

View file

@ -1,5 +1,5 @@
import "websocket-polyfill"; import "websocket-polyfill";
import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser, NostrEvent } from '@nostr-dev-kit/ndk'; import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser } from '@nostr-dev-kit/ndk';
import { NDKNostrRpc } from '@nostr-dev-kit/ndk'; import { NDKNostrRpc } from '@nostr-dev-kit/ndk';
import createDebug from 'debug'; import createDebug from 'debug';
import { Key, KeyUser } from '../run'; import { Key, KeyUser } from '../run';
@ -168,12 +168,17 @@ class AdminInterface {
this.handleRequest(req); this.handleRequest(req);
}); });
// pingOrDie disabled — NDK 2.8.1 outbox model doesn't echo // Connection watchdog: exit if pool reports no connected relays
// self-published events back through subscriptions on // for >60s so the process supervisor (systemd / docker restart
// non-public relay channels, so the watchdog fires false // policy / k8s) can recover. Replaces the original self-echo
// positives and exits the bunker every 50s on private relays. // pingOrDie — see relayConnectionWatchdog comment + #4 + #7.
// See aiolabs/nsecbunkerd#4 + #7. // Operators with external liveness checking can disable via
// pingOrDie(this.ndk); // NSEC_BUNKER_DISABLE_WATCHDOG=1.
if (process.env.NSEC_BUNKER_DISABLE_WATCHDOG !== '1') {
relayConnectionWatchdog(this.ndk);
} else {
console.log('⏸ watchdog disabled via NSEC_BUNKER_DISABLE_WATCHDOG=1');
}
}).catch((err) => { }).catch((err) => {
console.log('❌ admin connection failed'); console.log('❌ admin connection failed');
console.log(err); console.log(err);
@ -454,44 +459,47 @@ class AdminInterface {
} }
} }
async function pingOrDie(ndk: NDK) { /**
let deathTimer: NodeJS.Timeout | null = null; * Pool-status connection watchdog. Exits the daemon if every relay in
* the pool stays disconnected for longer than PARTITION_THRESHOLD_MS.
function resetDeath() { *
if (deathTimer) clearTimeout(deathTimer); * Replaces the original `pingOrDie` self-echo watchdog, which published
deathTimer = setTimeout(() => { * a kind-24133 event to its own pubkey every 20s and exited if it
console.log(`❌ No ping event received in 30 seconds. Exiting.`); * didn't see the echo within 50s. That works on public relays but
process.exit(1); * silently breaks on single-private-relay setups: NDK 2.8.1's outbox
}, 50000); * model doesn't reliably route self-publishes back through the
} * matching subscription, so the watchdog fires false positives and
* exits the daemon every 50s while RPCs over the same channel still
const self = await ndk.signer!.user(); * work fine. See aiolabs/nsecbunkerd#4 + #7.
const sub = ndk.subscribe({ *
authors: [self.pubkey], * The pool-status approach uses NDK's own connection-lifecycle
kinds: [NDKKind.NostrConnect], * tracking — `pool.connectedRelays()` reports relays in
"#p": [self.pubkey] * NDKRelayStatus.CONNECTED — which is reliable across all relay
}); * configurations because it doesn't depend on round-trip
sub.on("event", (event: NDKEvent) => { * publish/subscribe. No event is published; no relay traffic.
console.log(`🔔 Received ping event:`, event.created_at); *
resetDeath(); * Detects partition within POLL_INTERVAL_MS + PARTITION_THRESHOLD_MS.
}); * Transient disconnects shorter than PARTITION_THRESHOLD_MS don't trip
sub.start(); * the watchdog — useful for relays that flap or briefly drop on
* network blips.
resetDeath(); */
async function relayConnectionWatchdog(ndk: NDK) {
const POLL_INTERVAL_MS = 10_000;
const PARTITION_THRESHOLD_MS = 60_000;
let lastConnectedAt = Date.now();
setInterval(() => { setInterval(() => {
const event = new NDKEvent(ndk, { const connectedCount = ndk.pool.connectedRelays().length;
kind: NDKKind.NostrConnect, if (connectedCount > 0) {
tags: [ ["p", self.pubkey] ], lastConnectedAt = Date.now();
content: "ping" return;
} as NostrEvent); }
event.publish().then(() => { const elapsed = Date.now() - lastConnectedAt;
console.log(`🔔 Sent ping event:`, event.created_at); if (elapsed > PARTITION_THRESHOLD_MS) {
}).catch((e: any) => { console.log(`❌ No connected relays for ${Math.floor(elapsed / 1000)}s. Exiting.`);
console.log(`❌ Failed to send ping event:`, e.message);
process.exit(1); process.exit(1);
}); }
}, 20000); }, POLL_INTERVAL_MS);
} }
export default AdminInterface; export default AdminInterface;