Compare commits

..

No commits in common. "fb1c239e152c2db8ce567afe495e1461ce49ce6d" and "662dd21a60acbd4ae12225af2736b4c2cd6fc8be" have entirely different histories.

View file

@ -1,5 +1,5 @@
import "websocket-polyfill";
import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser } from '@nostr-dev-kit/ndk';
import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser, NostrEvent } from '@nostr-dev-kit/ndk';
import { NDKNostrRpc } from '@nostr-dev-kit/ndk';
import createDebug from 'debug';
import { Key, KeyUser } from '../run';
@ -168,17 +168,12 @@ class AdminInterface {
this.handleRequest(req);
});
// Connection watchdog: exit if pool reports no connected relays
// for >60s so the process supervisor (systemd / docker restart
// policy / k8s) can recover. Replaces the original self-echo
// pingOrDie — see relayConnectionWatchdog comment + #4 + #7.
// Operators with external liveness checking can disable via
// NSEC_BUNKER_DISABLE_WATCHDOG=1.
if (process.env.NSEC_BUNKER_DISABLE_WATCHDOG !== '1') {
relayConnectionWatchdog(this.ndk);
} else {
console.log('⏸ watchdog disabled via NSEC_BUNKER_DISABLE_WATCHDOG=1');
}
// pingOrDie disabled — NDK 2.8.1 outbox model doesn't echo
// self-published events back through subscriptions on
// non-public relay channels, so the watchdog fires false
// positives and exits the bunker every 50s on private relays.
// See aiolabs/nsecbunkerd#4 + #7.
// pingOrDie(this.ndk);
}).catch((err) => {
console.log('❌ admin connection failed');
console.log(err);
@ -459,47 +454,44 @@ class AdminInterface {
}
}
/**
* Pool-status connection watchdog. Exits the daemon if every relay in
* the pool stays disconnected for longer than PARTITION_THRESHOLD_MS.
*
* Replaces the original `pingOrDie` self-echo watchdog, which published
* a kind-24133 event to its own pubkey every 20s and exited if it
* didn't see the echo within 50s. That works on public relays but
* silently breaks on single-private-relay setups: NDK 2.8.1's outbox
* model doesn't reliably route self-publishes back through the
* matching subscription, so the watchdog fires false positives and
* exits the daemon every 50s while RPCs over the same channel still
* work fine. See aiolabs/nsecbunkerd#4 + #7.
*
* The pool-status approach uses NDK's own connection-lifecycle
* tracking — `pool.connectedRelays()` reports relays in
* NDKRelayStatus.CONNECTED — which is reliable across all relay
* configurations because it doesn't depend on round-trip
* publish/subscribe. No event is published; no relay traffic.
*
* Detects partition within POLL_INTERVAL_MS + PARTITION_THRESHOLD_MS.
* Transient disconnects shorter than PARTITION_THRESHOLD_MS don't trip
* the watchdog — useful for relays that flap or briefly drop on
* network blips.
*/
async function relayConnectionWatchdog(ndk: NDK) {
const POLL_INTERVAL_MS = 10_000;
const PARTITION_THRESHOLD_MS = 60_000;
let lastConnectedAt = Date.now();
async function pingOrDie(ndk: NDK) {
let deathTimer: NodeJS.Timeout | null = null;
/**
 * (Re)arms the death timer.
 *
 * Cancels any timer that is still pending and schedules a new one. If the
 * new timer is not reset again before it fires, it logs a message and
 * terminates the process with exit code 1.
 */
function resetDeath() {
    // Single source of truth for the timeout so the log message can never
    // drift from the actual delay (the message previously claimed 30s while
    // the timer was armed for 50s).
    const DEATH_TIMEOUT_MS = 50000;

    if (deathTimer) clearTimeout(deathTimer);

    deathTimer = setTimeout(() => {
        console.log(`❌ No ping event received in ${DEATH_TIMEOUT_MS / 1000} seconds. Exiting.`);
        process.exit(1);
    }, DEATH_TIMEOUT_MS);
}
const self = await ndk.signer!.user();
const sub = ndk.subscribe({
authors: [self.pubkey],
kinds: [NDKKind.NostrConnect],
"#p": [self.pubkey]
});
sub.on("event", (event: NDKEvent) => {
console.log(`🔔 Received ping event:`, event.created_at);
resetDeath();
});
sub.start();
resetDeath();
setInterval(() => {
const connectedCount = ndk.pool.connectedRelays().length;
if (connectedCount > 0) {
lastConnectedAt = Date.now();
return;
}
const elapsed = Date.now() - lastConnectedAt;
if (elapsed > PARTITION_THRESHOLD_MS) {
console.log(`❌ No connected relays for ${Math.floor(elapsed / 1000)}s. Exiting.`);
const event = new NDKEvent(ndk, {
kind: NDKKind.NostrConnect,
tags: [ ["p", self.pubkey] ],
content: "ping"
} as NostrEvent);
event.publish().then(() => {
console.log(`🔔 Sent ping event:`, event.created_at);
}).catch((e: any) => {
console.log(`❌ Failed to send ping event:`, e.message);
process.exit(1);
}
}, POLL_INTERVAL_MS);
});
}, 20000);
}
export default AdminInterface;