feat(v2): abandoned-tx queue + force-reset for stuck settlements (P3f)
Completes the P3 operator-UX cluster. Surfaces settlements that didn't
process cleanly as a queryable worklist so operators can investigate +
retry without scanning the full settlement history.
New endpoints:
GET /api/v1/dca/settlements/stuck?threshold_minutes=30
Returns StuckSettlementsResponse with three buckets:
- errored: distribution failed; the existing /retry endpoint handles these
- stuck_pending: landed but never picked up (listener crashed
before invoking process_settlement)
- stuck_processing: claim taken but no completion in N minutes;
processor crashed mid-flight, processing_claim is set but no
terminal state landed
POST /api/v1/dca/settlements/{id}/force-reset
Operator escape hatch for genuinely stuck settlements. Flips
'pending'/'processing' → 'errored' so the /retry endpoint can take
over. Refuses unless the settlement is older than threshold_minutes
(default 30) so operators can't accidentally interrupt a
slow-but-running settlement. Age check uses created_at as proxy.
CRUD:
- get_stuck_settlements_for_operator(uid, threshold_minutes) joins
dca_settlements → dca_machines and returns the three lists
scoped per operator. No age filter on 'errored' (operators always
want to see those); age filter applies to 'pending'/'processing'.
- force_reset_stuck_settlement(id) UPDATEs 'pending'/'processing' to
'errored', clears processing_claim, sets a marker error_message.
The retry endpoint shipped in fix bundle 1 (commit 3ede66f) is the
intended downstream step: the operator sees a stuck-processing row, hits
force-reset (flips to errored), then hits retry (flips to pending, voids
failed legs, re-runs process_settlement via the claim path).
34 routes registered. 72/72 tests pass.
Refs: aiolabs/satmachineadmin#9 — completes P3 operator-UX cluster
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3ede66ff92
commit
578f2c142d
3 changed files with 191 additions and 0 deletions
85
views_api.py
85
views_api.py
|
|
@ -20,6 +20,7 @@ from .crud import (
|
|||
delete_dca_client,
|
||||
delete_deposit,
|
||||
delete_machine,
|
||||
force_reset_stuck_settlement,
|
||||
get_client_balance_summary,
|
||||
get_commission_splits,
|
||||
get_dca_client,
|
||||
|
|
@ -35,6 +36,7 @@ from .crud import (
|
|||
get_settlement,
|
||||
get_settlements_for_machine,
|
||||
get_settlements_for_operator,
|
||||
get_stuck_settlements_for_operator,
|
||||
get_super_config,
|
||||
replace_commission_splits,
|
||||
reset_settlement_for_retry,
|
||||
|
|
@ -64,6 +66,7 @@ from .models import (
|
|||
PartialDispenseData,
|
||||
SetCommissionSplitsData,
|
||||
SettleBalanceData,
|
||||
StuckSettlementsResponse,
|
||||
SuperConfig,
|
||||
UpdateDcaClientData,
|
||||
UpdateDepositData,
|
||||
|
|
@ -469,6 +472,88 @@ async def api_partial_dispense(
|
|||
raise HTTPException(HTTPStatus.BAD_REQUEST, str(exc)) from exc
|
||||
|
||||
|
||||
@satmachineadmin_api_router.get(
    "/api/v1/dca/settlements/stuck", response_model=StuckSettlementsResponse
)
async def api_list_stuck_settlements(
    threshold_minutes: int = 30,
    user: User = Depends(check_user_exists),
) -> StuckSettlementsResponse:
    """Return the calling operator's worklist of problem settlements.

    The response echoes `threshold_minutes` and carries three buckets,
    copied from the dict returned by `get_stuck_settlements_for_operator`:

    - errored: distribution failed; the retry endpoint handles these
    - stuck_pending: landed but never picked up by the processor
    - stuck_processing: claim taken but no completion in N minutes

    `threshold_minutes` is the age threshold for counting a settlement as
    'stuck' (default 30). Values below 1 are rejected with 400 Bad Request.

    Stuck-processing settlements can be force-recovered via
    POST /api/v1/dca/settlements/{id}/force-reset.
    """
    # Guard clause: a zero or negative threshold is rejected before the lookup.
    if threshold_minutes <= 0:
        raise HTTPException(
            HTTPStatus.BAD_REQUEST, "threshold_minutes must be >= 1"
        )

    found = await get_stuck_settlements_for_operator(user.id, threshold_minutes)

    # Copy the three buckets across by name, in the response's field order.
    bucket_names = ("errored", "stuck_pending", "stuck_processing")
    return StuckSettlementsResponse(
        threshold_minutes=threshold_minutes,
        **{name: found[name] for name in bucket_names},
    )
|
||||
|
||||
|
||||
@satmachineadmin_api_router.post(
    "/api/v1/dca/settlements/{settlement_id}/force-reset",
    response_model=DcaSettlement,
)
async def api_force_reset_settlement(
    settlement_id: str,
    threshold_minutes: int = 30,
    user: User = Depends(check_user_exists),
) -> DcaSettlement:
    """Operator escape hatch for genuinely stuck settlements.

    Intended for settlements whose processor crashed mid-flight and never
    released its claim. Flips status 'pending'/'processing' → 'errored' so
    the retry endpoint can take over.

    Refuses unless the settlement is older than `threshold_minutes` so an
    operator can't accidentally interrupt a slow-but-running settlement.
    The age check uses created_at as a proxy — adequate for v1 since the
    processor either completes fast or it crashed.

    Args:
        settlement_id: ID of the settlement to reset.
        threshold_minutes: Minimum age, in minutes, before a reset is
            allowed (default 30). Must be >= 1.
        user: Authenticated caller; must be the operator of the machine
            the settlement belongs to.

    Returns:
        The settlement as returned by `force_reset_stuck_settlement`.

    Raises:
        HTTPException: 400 if `threshold_minutes` < 1, if the status is not
            'pending'/'processing', or if the settlement is younger than
            the threshold; 404 if the settlement or its machine is missing
            or belongs to another operator; 500 if the reset returns
            nothing.
    """
    from datetime import datetime, timedelta, timezone

    # A threshold of zero or less would make the age guard below always
    # pass, silently disabling the "don't interrupt a running settlement"
    # protection. Reject it with the same rule as the stuck-list endpoint.
    if threshold_minutes < 1:
        raise HTTPException(
            HTTPStatus.BAD_REQUEST, "threshold_minutes must be >= 1"
        )

    settlement = await get_settlement(settlement_id)
    if settlement is None:
        raise HTTPException(HTTPStatus.NOT_FOUND, "Settlement not found")

    # Ownership check. Another operator's settlement gets the same 404 as a
    # missing one (presumably so existence isn't revealed — confirm intent).
    machine = await get_machine(settlement.machine_id)
    if machine is None or machine.operator_user_id != user.id:
        raise HTTPException(HTTPStatus.NOT_FOUND, "Settlement not found")

    if settlement.status not in ("pending", "processing"):
        raise HTTPException(
            HTTPStatus.BAD_REQUEST,
            f"settlement status must be 'pending' or 'processing' to "
            f"force-reset (currently '{settlement.status}')",
        )

    # Age check — refuse if the settlement is fresh (the processor might
    # still be running normally). A naive created_at is treated as UTC so
    # both sides are timezone-aware before the comparison.
    created = settlement.created_at
    if created.tzinfo is None:
        created = created.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - created
    if age < timedelta(minutes=threshold_minutes):
        raise HTTPException(
            HTTPStatus.BAD_REQUEST,
            f"settlement is only {age.total_seconds() / 60:.1f} minutes "
            f"old (threshold {threshold_minutes}m); refusing to force-reset "
            "a possibly-still-running settlement",
        )

    updated = await force_reset_stuck_settlement(settlement_id)
    if updated is None:
        raise HTTPException(
            HTTPStatus.INTERNAL_SERVER_ERROR, "failed to force-reset"
        )
    return updated
|
||||
|
||||
|
||||
@satmachineadmin_api_router.post(
|
||||
"/api/v1/dca/settlements/{settlement_id}/retry",
|
||||
response_model=DcaSettlement,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue