release(v4.2.1): fix concurrent-ratchet desync via OutboundQueue waiter cursor
Some checks failed
Publish / publish (push) Has been cancelled
Docker build and publish / docker (push) Has been cancelled

Pull-mode httpClient + drainer + parallel RPCs against the same peer
started failing with `DecryptionError` after ~10s. Two bugs combined:

- `OutboundQueue.enqueue` woke `drain` waiters with a `since=0`
  snapshot, replaying already-processed events into
  `Shade.acceptTransferEnvelope` → `manager.decrypt` twice. The
  duplicate consumed an already-used skipped key and corrupted the
  Double Ratchet receive chain.

- `ratchetDecrypt` then propagated the corruption: a same-DH
  message behind the chain with no cached skipped key fell through
  to `kdfChainKey` on the ahead state and rewound `chain.counter`,
  permanently desyncing the chain.

Fix `OutboundQueue` to honor each waiter's `since`, and harden
`ratchetDecrypt` so any future duplicate fails cleanly without
mutating state. Adds regression coverage at all three layers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 22:58:26 +02:00
parent 7520b11b25
commit b77b7e771c
30 changed files with 380 additions and 29 deletions

View File

@@ -1,6 +1,6 @@
{
"name": "@shade/transfer",
"version": "4.2.0",
"version": "4.2.1",
"type": "module",
"main": "src/index.ts",
"types": "src/index.ts",

View File

@@ -81,6 +81,14 @@ const DEFAULT_IDLE_EVICTION_MS = 10 * 60 * 1000;
interface PendingWaiter {
resolve(events: QueuedEvent[]): void;
reject(err: Error): void;
/**
* The waiter's `since` cursor — only events with `id > since` should
* be delivered when this waiter is resolved. Without this, an
* enqueue that arrives while a poller is waiting would replay
* already-processed events, causing the receiver to double-decrypt
* (and corrupt ratchet state).
*/
since: number;
timer: ReturnType<typeof setTimeout>;
abortHandler?: () => void;
signal?: AbortSignal;
@@ -140,16 +148,21 @@ export class OutboundQueue {
// last polled id; the @shade/transfer engine handles missing seqs
// by re-sending on resume.
while (state.events.length > this.maxEvents) state.events.shift();
// Wake all waiters with whatever has accumulated.
const drained = this.collect(state, 0);
if (drained.length > 0) {
// Wake each waiter with events newer than ITS OWN `since`. Using a
// shared snapshot from `since=0` would replay events the waiter has
// already processed once a fresh enqueue arrived mid-poll, which on
// the receiver side double-dispatches an envelope into shade.receive
// → manager.decrypt and consumes the same skipped-key twice (the
// second dispatch corrupts the ratchet chain).
if (state.waiters.length > 0) {
const waiters = state.waiters.splice(0);
for (const w of waiters) {
clearTimeout(w.timer);
if (w.abortHandler !== undefined && w.signal !== undefined) {
w.signal.removeEventListener('abort', w.abortHandler);
}
w.resolve(drained);
const wDrained = this.collect(state, w.since);
w.resolve(wDrained);
}
}
return event;
@@ -181,7 +194,7 @@ export class OutboundQueue {
// Empty drain on timeout — that's the "no new events" signal.
resolve([]);
}, blockMs);
const waiter: PendingWaiter = { resolve, reject, timer };
const waiter: PendingWaiter = { resolve, reject, since, timer };
if (signal !== undefined) {
const handler = () => {
const idx = state.waiters.indexOf(waiter);

View File

@@ -0,0 +1,60 @@
import { describe, expect, test } from 'bun:test';
import { OutboundQueue } from '../src/index.js';
/**
* Regression coverage for the long-poll waiter `since` cursor.
*
* The bug being guarded against: when `enqueue` woke a pending
* `drain` waiter, it used a `since=0` snapshot and replayed every
* event that had ever been queued — including the ones the waiter
* had already processed in a previous poll. Downstream the queue
* fed `Shade.acceptTransferEnvelope`, so the duplicate replay
* dispatched the same envelope into `manager.decrypt` twice. The
* second decrypt consumed an already-used skipped key, fell into
* the stale-counter branch of `ratchetDecrypt`, and corrupted the
* Double Ratchet receive chain — surfacing as
* `DecryptionError: wrong key or tampered data` on every
* subsequent message.
*/
describe('OutboundQueue — waiter since cursor', () => {
  // Long-poll budget handed to `drain`. Each test expects the enqueue to
  // resolve the parked poll well before this elapses.
  const LONG_POLL_MS = 5_000;

  /** Reduces a batch of events to their ids so batches can be compared. */
  const idsOf = <T>(batch: ReadonlyArray<{ id: T }>): T[] => batch.map((item) => item.id);

  test('mid-poll enqueue must not replay events the waiter already saw', async () => {
    const outbound = new OutboundQueue({ idleEvictionMs: 0 });
    const peerId = 'alice';

    // Seed two events, then consume them with a non-blocking poll from
    // cursor 0 so the consumer has "already seen" both.
    const seenA = outbound.enqueue(peerId, { kind: 'envelope', bytes: new Uint8Array([1]) });
    const seenB = outbound.enqueue(peerId, { kind: 'envelope', bytes: new Uint8Array([2]) });
    const initialBatch = await outbound.drain(peerId, 0, 0);
    expect(idsOf(initialBatch)).toEqual([seenA.id, seenB.id]);

    // Poll again from the last seen id. Nothing newer exists yet, so
    // this call is expected to block as a pending waiter.
    const pendingPoll = outbound.drain(peerId, seenB.id, LONG_POLL_MS);

    // Give `drain` one microtask turn to register its waiter before the
    // enqueue below races against it.
    await Promise.resolve();

    const fresh = outbound.enqueue(peerId, { kind: 'envelope', bytes: new Uint8Array([3]) });
    const wokenBatch = await pendingPoll;

    // With the old `since=0` wake-up snapshot this batch would have
    // contained all three events; only the one past the waiter's own
    // cursor may be delivered.
    expect(idsOf(wokenBatch)).toEqual([fresh.id]);
  });

  // NOTE(review): the title mentions "others" polling past the waiter, but
  // only a single waiter is exercised here — confirm whether a
  // multi-waiter scenario was intended.
  test('parked waiter at the head still gets the new event when others have polled past it', async () => {
    const outbound = new OutboundQueue({ idleEvictionMs: 0 });
    const peerId = 'alice';

    const head = outbound.enqueue(peerId, { kind: 'envelope', bytes: new Uint8Array([1]) });

    // Poll from the only event's id: there is nothing newer, so the
    // waiter has to park until the next enqueue.
    const pendingPoll = outbound.drain(peerId, head.id, LONG_POLL_MS);
    await Promise.resolve();

    const next = outbound.enqueue(peerId, { kind: 'envelope', bytes: new Uint8Array([2]) });
    const wokenBatch = await pendingPoll;

    expect(idsOf(wokenBatch)).toEqual([next.id]);
  });
});