fix(#4): replace pingOrDie self-echo watchdog with pool-status check
The original watchdog published a kind-24133 event to its own pubkey
every 20s and exited if no echo arrived within 50s. On a single private
relay setup (LNbits's nostrrelay extension channel), NDK 2.8.1's outbox
model doesn't reliably route self-publishes back through the matching
subscription, so the watchdog fires false positives and exits every 50s
even though admin RPCs over the same channel still work fine. The
upstream patches we landed previously (commit 42dbbd7) commented the
call out as an emergency stopgap; this commit replaces the mechanism
with one that actually answers the right question.
Pool-status watchdog: poll `ndk.pool.connectedRelays().length` every
10s, track the most recent moment any relay was connected, exit if no
relay has been connected for 60s. Uses NDK's own connection-lifecycle
tracking, which works reliably across all relay configurations — no
self-publish, no subscription dependency, no relay traffic. Same intent
as pingOrDie (detect a partition at the relay layer and let the
supervisor restart us), but with a reliable signal.
Call site re-enable + env-flag opt-out follow in the next commit.
Drops the now-unused NostrEvent import.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
662dd21a60
commit
1792bc489c
1 changed files with 39 additions and 36 deletions
|
|
@ -1,5 +1,5 @@
|
|||
import "websocket-polyfill";
|
||||
import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser, NostrEvent } from '@nostr-dev-kit/ndk';
|
||||
import NDK, { NDKEvent, NDKKind, NDKPrivateKeySigner, NDKRpcRequest, NDKRpcResponse, NDKUser } from '@nostr-dev-kit/ndk';
|
||||
import { NDKNostrRpc } from '@nostr-dev-kit/ndk';
|
||||
import createDebug from 'debug';
|
||||
import { Key, KeyUser } from '../run';
|
||||
|
|
@ -454,44 +454,47 @@ class AdminInterface {
|
|||
}
|
||||
}
|
||||
|
||||
async function pingOrDie(ndk: NDK) {
|
||||
let deathTimer: NodeJS.Timeout | null = null;
|
||||
|
||||
function resetDeath() {
|
||||
if (deathTimer) clearTimeout(deathTimer);
|
||||
deathTimer = setTimeout(() => {
|
||||
console.log(`❌ No ping event received in 30 seconds. Exiting.`);
|
||||
process.exit(1);
|
||||
}, 50000);
|
||||
}
|
||||
|
||||
const self = await ndk.signer!.user();
|
||||
const sub = ndk.subscribe({
|
||||
authors: [self.pubkey],
|
||||
kinds: [NDKKind.NostrConnect],
|
||||
"#p": [self.pubkey]
|
||||
});
|
||||
sub.on("event", (event: NDKEvent) => {
|
||||
console.log(`🔔 Received ping event:`, event.created_at);
|
||||
resetDeath();
|
||||
});
|
||||
sub.start();
|
||||
|
||||
resetDeath();
|
||||
/**
|
||||
* Pool-status connection watchdog. Exits the daemon if every relay in
|
||||
* the pool stays disconnected for longer than PARTITION_THRESHOLD_MS.
|
||||
*
|
||||
* Replaces the original `pingOrDie` self-echo watchdog, which published
|
||||
* a kind-24133 event to its own pubkey every 20s and exited if it
|
||||
* didn't see the echo within 50s. That works on public relays but
|
||||
* silently breaks on single-private-relay setups: NDK 2.8.1's outbox
|
||||
* model doesn't reliably route self-publishes back through the
|
||||
* matching subscription, so the watchdog fires false positives and
|
||||
* exits the daemon every 50s while RPCs over the same channel still
|
||||
* work fine. See aiolabs/nsecbunkerd#4 + #7.
|
||||
*
|
||||
* The pool-status approach uses NDK's own connection-lifecycle
|
||||
* tracking — `pool.connectedRelays()` reports relays in
|
||||
* NDKRelayStatus.CONNECTED — which is reliable across all relay
|
||||
* configurations because it doesn't depend on round-trip
|
||||
* publish/subscribe. No event is published; no relay traffic.
|
||||
*
|
||||
* Detects partition within POLL_INTERVAL + PARTITION_THRESHOLD ms.
|
||||
* Transient disconnects shorter than PARTITION_THRESHOLD don't trip
|
||||
* the watchdog — useful for relays that flap or briefly drop on
|
||||
* network blips.
|
||||
*/
|
||||
async function relayConnectionWatchdog(ndk: NDK) {
|
||||
const POLL_INTERVAL_MS = 10_000;
|
||||
const PARTITION_THRESHOLD_MS = 60_000;
|
||||
let lastConnectedAt = Date.now();
|
||||
|
||||
setInterval(() => {
|
||||
const event = new NDKEvent(ndk, {
|
||||
kind: NDKKind.NostrConnect,
|
||||
tags: [ ["p", self.pubkey] ],
|
||||
content: "ping"
|
||||
} as NostrEvent);
|
||||
event.publish().then(() => {
|
||||
console.log(`🔔 Sent ping event:`, event.created_at);
|
||||
}).catch((e: any) => {
|
||||
console.log(`❌ Failed to send ping event:`, e.message);
|
||||
const connectedCount = ndk.pool.connectedRelays().length;
|
||||
if (connectedCount > 0) {
|
||||
lastConnectedAt = Date.now();
|
||||
return;
|
||||
}
|
||||
const elapsed = Date.now() - lastConnectedAt;
|
||||
if (elapsed > PARTITION_THRESHOLD_MS) {
|
||||
console.log(`❌ No connected relays for ${Math.floor(elapsed / 1000)}s. Exiting.`);
|
||||
process.exit(1);
|
||||
});
|
||||
}, 20000);
|
||||
}
|
||||
}, POLL_INTERVAL_MS);
|
||||
}
|
||||
|
||||
export default AdminInterface;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue