Compare commits

...

2 commits

Author SHA1 Message Date
fb1c239e15 fix(#4): re-enable connection watchdog with env-flag opt-out
Some checks failed
Docker image / build-and-push-image (push) Has been cancelled
Calls `relayConnectionWatchdog` (introduced in the previous commit) at
the end of admin-interface connect(). Gated by NSEC_BUNKER_DISABLE_WATCHDOG=1
for operators who run external liveness checks (Prometheus probes, k8s
readiness, etc.) and don't want the daemon to self-terminate.

This restores the watchdog behavior that was commented out in commit
42dbbd7 (the emergency stopgap for the old self-echo false positives),
but on top of the now-reliable pool-status mechanism.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 20:43:12 +02:00
1792bc489c fix(#4): replace pingOrDie self-echo watchdog with pool-status check
The original watchdog published a kind-24133 event to its own pubkey
every 20s and exited if no echo arrived within 50s. On a single private
relay setup (LNbits's nostrrelay extension channel), NDK 2.8.1's outbox
model doesn't reliably route self-publishes back through the matching
subscription, so the watchdog fires false positives and exits every 50s
even though admin RPCs over the same channel still work fine. The
upstream patches we landed previously (commit 42dbbd7) commented the
call out as an emergency stopgap; this commit replaces the mechanism
with one that actually answers the right question.

Pool-status watchdog: poll `ndk.pool.connectedRelays().length` every
10s, track the most recent moment any relay was connected, exit if no
relay has been connected for 60s. Uses NDK's own connection-lifecycle
tracking which works reliably across all relay configurations — no
self-publish, no subscription dependency, no relay traffic. Same intent
as pingOrDie (detect a partition from the relay layer and let the
supervisor restart us), but with a reliable signal.

Call site re-enable + env-flag opt-out follow in the next commit.

Drops the now-unused NostrEvent import.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 20:42:43 +02:00

View file

@ -1,5 +1,5 @@
import "websocket-polyfill";
import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser, NostrEvent } from '@nostr-dev-kit/ndk';
import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser } from '@nostr-dev-kit/ndk';
import { NDKNostrRpc } from '@nostr-dev-kit/ndk';
import createDebug from 'debug';
import { Key, KeyUser } from '../run';
@ -168,12 +168,17 @@ class AdminInterface {
this.handleRequest(req);
});
// pingOrDie disabled — NDK 2.8.1 outbox model doesn't echo
// self-published events back through subscriptions on
// non-public relay channels, so the watchdog fires false
// positives and exits the bunker every 50s on private relays.
// See aiolabs/nsecbunkerd#4 + #7.
// pingOrDie(this.ndk);
// Connection watchdog: exit if pool reports no connected relays
// for >60s so the process supervisor (systemd / docker restart
// policy / k8s) can recover. Replaces the original self-echo
// pingOrDie — see relayConnectionWatchdog comment + #4 + #7.
// Operators with external liveness checking can disable via
// NSEC_BUNKER_DISABLE_WATCHDOG=1.
if (process.env.NSEC_BUNKER_DISABLE_WATCHDOG !== '1') {
relayConnectionWatchdog(this.ndk);
} else {
console.log('⏸ watchdog disabled via NSEC_BUNKER_DISABLE_WATCHDOG=1');
}
}).catch((err) => {
console.log('❌ admin connection failed');
console.log(err);
@ -454,44 +459,47 @@ class AdminInterface {
}
}
async function pingOrDie(ndk: NDK) {
let deathTimer: NodeJS.Timeout | null = null;
function resetDeath() {
if (deathTimer) clearTimeout(deathTimer);
deathTimer = setTimeout(() => {
console.log(`❌ No ping event received in 30 seconds. Exiting.`);
process.exit(1);
}, 50000);
}
const self = await ndk.signer!.user();
const sub = ndk.subscribe({
authors: [self.pubkey],
kinds: [NDKKind.NostrConnect],
"#p": [self.pubkey]
});
sub.on("event", (event: NDKEvent) => {
console.log(`🔔 Received ping event:`, event.created_at);
resetDeath();
});
sub.start();
resetDeath();
/**
* Pool-status connection watchdog. Exits the daemon if every relay in
* the pool stays disconnected for longer than PARTITION_THRESHOLD_MS.
*
* Replaces the original `pingOrDie` self-echo watchdog, which published
* a kind-24133 event to its own pubkey every 20s and exited if it
* didn't see the echo within 50s. That works on public relays but
* silently breaks on single-private-relay setups: NDK 2.8.1's outbox
* model doesn't reliably route self-publishes back through the
* matching subscription, so the watchdog fires false positives and
* exits the daemon every 50s while RPCs over the same channel still
* work fine. See aiolabs/nsecbunkerd#4 + #7.
*
* The pool-status approach uses NDK's own connection-lifecycle
* tracking: `pool.connectedRelays()` reports relays in
* NDKRelayStatus.CONNECTED, which is reliable across all relay
* configurations because it doesn't depend on round-trip
* publish/subscribe. No event is published; no relay traffic.
*
* Detects partition within POLL_INTERVAL_MS + PARTITION_THRESHOLD_MS.
* Transient disconnects shorter than PARTITION_THRESHOLD_MS don't trip
* the watchdog, which is useful for relays that flap or briefly drop on
* network blips.
*/
async function relayConnectionWatchdog(ndk: NDK) {
const POLL_INTERVAL_MS = 10_000;
const PARTITION_THRESHOLD_MS = 60_000;
let lastConnectedAt = Date.now();
setInterval(() => {
const event = new NDKEvent(ndk, {
kind: NDKKind.NostrConnect,
tags: [ ["p", self.pubkey] ],
content: "ping"
} as NostrEvent);
event.publish().then(() => {
console.log(`🔔 Sent ping event:`, event.created_at);
}).catch((e: any) => {
console.log(`❌ Failed to send ping event:`, e.message);
const connectedCount = ndk.pool.connectedRelays().length;
if (connectedCount > 0) {
lastConnectedAt = Date.now();
return;
}
const elapsed = Date.now() - lastConnectedAt;
if (elapsed > PARTITION_THRESHOLD_MS) {
console.log(`❌ No connected relays for ${Math.floor(elapsed / 1000)}s. Exiting.`);
process.exit(1);
});
}, 20000);
}
}, POLL_INTERVAL_MS);
}
export default AdminInterface;