Files
Toju/toju-app/src/app/infrastructure/realtime/peer-connection-manager/recovery/peer-recovery.ts
Myx de2d3300d4 fix: Fix users unable to see or hear each other in voice channels due to
stale server sockets, passive non-initiators, and race conditions
during peer connection setup.

Fix users unable to see or hear each other in voice channels due to
stale server sockets, passive non-initiators, and race conditions
during peer connection setup.

Server:
- Close stale WebSocket connections sharing the same oderId in
  handleIdentify instead of letting them linger up to 45s
- Make user_joined/user_left broadcasts identity-aware so duplicate
  sockets don't produce phantom join/leave events
- Include serverIds in user_left payload for multi-room presence
- Simplify findUserByOderId now that stale sockets are cleaned up

Client - signaling:
- Add fallback offer system with 1s timer for missed user_joined races
- Add non-initiator takeover after 5s when the initiator fails to send
  an offer (NON_INITIATOR_GIVE_UP_MS)
- Scope peerServerMap per signaling URL to prevent cross-server
  collisions
- Add socket identity guards on all signaling event handlers
- Replace canReusePeerConnection with hasActivePeerConnection and
  isPeerConnectionNegotiating with extended grace periods

Client - peer connections:
- Extract replaceUnusablePeer helper to deduplicate stale peer
  replacement in offer and ICE handlers
- Add stale connectionstatechange guard to ignore events from replaced
  RTCPeerConnection instances
- Use deterministic initiator election in peer recovery reconnects
- Track createdAt on PeerData for staleness detection

Client - presence:
- Add multi-room presence tracking via presenceServerIds on User
- Replace clearUsers + individual userJoined with syncServerPresence
  for atomic server roster updates
- Make userLeft handle partial server removal instead of full eviction

Documentation:
- Add server-side connection hygiene, non-initiator takeover, and stale
  peer replacement sections to the realtime README
2026-04-04 02:47:58 +02:00

290 lines
8.2 KiB
TypeScript

import {
CONNECTION_STATE_CONNECTED,
DATA_CHANNEL_STATE_OPEN,
P2P_TYPE_VOICE_STATE_REQUEST,
PEER_DISCONNECT_GRACE_MS,
PEER_RECONNECT_INTERVAL_MS,
PEER_RECONNECT_MAX_ATTEMPTS
} from '../../realtime.constants';
import {
PeerConnectionManagerContext,
PeerConnectionManagerState,
RecoveryHandlers,
RemovePeerOptions
} from '../shared';
import { clearAllPingTimers, stopPingInterval } from '../messaging/ping';
/**
 * Tear down everything held for one peer and announce the disconnect.
 *
 * The pending disconnect-grace timer and the remote stream entries are always
 * dropped. The reconnect timer and the disconnected-peer tracker entry are
 * dropped too unless `options.preserveReconnectState` is exactly `true`.
 *
 * When no active connection entry exists for `peerId`, nothing is closed and
 * no disconnect event is emitted.
 *
 * @param context Manager context; only `context.state` is used here.
 * @param peerId Identifier of the peer to remove.
 * @param options Optional flags; see `preserveReconnectState` above.
 */
export function removePeer(
  context: PeerConnectionManagerContext,
  peerId: string,
  options?: RemovePeerOptions
): void {
  const { state } = context;
  const entry = state.activePeerConnections.get(peerId);

  clearPeerDisconnectGraceTimer(state, peerId);

  // Reconnect bookkeeping survives only when the caller explicitly asks for it.
  if (options?.preserveReconnectState !== true) {
    clearPeerReconnectTimer(state, peerId);
    state.disconnectedPeerTracker.delete(peerId);
  }

  state.remotePeerStreams.delete(peerId);
  state.remotePeerVoiceStreams.delete(peerId);
  state.remotePeerScreenShareStreams.delete(peerId);

  if (!entry) {
    return;
  }

  // Close the data channel (if one exists) before the connection itself.
  entry.dataChannel?.close();
  entry.connection.close();

  state.activePeerConnections.delete(peerId);
  state.peerNegotiationQueue.delete(peerId);
  removeFromConnectedPeers(state, peerId);

  stopPingInterval(state, peerId);
  state.peerLatencies.delete(peerId);
  state.pendingPings.delete(peerId);

  state.peerDisconnected$.next(peerId);
}
/**
 * Close every active peer connection and clear internal state.
 *
 * Cancels all reconnect, disconnect-grace and ping timers, closes each peer's
 * data channel (when present) and connection, empties the per-peer maps, and
 * finally resets the connected-peers list.
 *
 * @param state Mutable manager state to wipe.
 */
export function closeAllPeers(state: PeerConnectionManagerState): void {
  clearAllPeerReconnectTimers(state);
  clearAllPeerDisconnectGraceTimers(state);
  clearAllPingTimers(state);

  state.activePeerConnections.forEach((peerData) => {
    peerData.dataChannel?.close();
    peerData.connection.close();
  });

  state.activePeerConnections.clear();
  state.remotePeerStreams.clear();
  state.remotePeerVoiceStreams.clear();
  state.remotePeerScreenShareStreams.clear();
  state.peerNegotiationQueue.clear();
  state.peerLatencies.clear();
  state.pendingPings.clear();

  // Reset the stored list as well as notifying subscribers. Emitting an empty
  // list while leaving `connectedPeersList` populated would let
  // `getConnectedPeerIds` / `addToConnectedPeers` resurrect peers that were
  // just closed.
  resetConnectedPeers(state);
}
/**
 * Start (or restart) disconnect tracking for a peer.
 *
 * Any existing tracker entry for `peerId` is overwritten with the current
 * time and a reconnect-attempt count of zero.
 */
export function trackDisconnectedPeer(state: PeerConnectionManagerState, peerId: string): void {
  const freshEntry = {
    lastSeenTimestamp: Date.now(),
    reconnectAttempts: 0
  };

  state.disconnectedPeerTracker.set(peerId, freshEntry);
}
/**
 * Cancel the reconnect interval registered for `peerId`, if there is one,
 * and forget it. Does nothing when no (truthy) timer is stored.
 */
export function clearPeerReconnectTimer(
  state: PeerConnectionManagerState,
  peerId: string
): void {
  const pendingInterval = state.peerReconnectTimers.get(peerId);

  if (!pendingInterval) {
    return;
  }

  clearInterval(pendingInterval);
  state.peerReconnectTimers.delete(peerId);
}
/**
 * Cancel the disconnect-grace timeout registered for `peerId`, if there is
 * one, and forget it. Does nothing when no (truthy) timer is stored.
 */
export function clearPeerDisconnectGraceTimer(
  state: PeerConnectionManagerState,
  peerId: string
): void {
  const pendingTimeout = state.peerDisconnectGraceTimers.get(peerId);

  if (!pendingTimeout) {
    return;
  }

  clearTimeout(pendingTimeout);
  state.peerDisconnectGraceTimers.delete(peerId);
}
/**
 * Stop every scheduled reconnect interval, then empty both the timer map and
 * the disconnected-peer tracker.
 */
export function clearAllPeerReconnectTimers(state: PeerConnectionManagerState): void {
  for (const pendingInterval of state.peerReconnectTimers.values()) {
    clearInterval(pendingInterval);
  }

  state.peerReconnectTimers.clear();
  state.disconnectedPeerTracker.clear();
}
/** Cancel every pending disconnect-grace timeout and empty the timer map. */
export function clearAllPeerDisconnectGraceTimers(state: PeerConnectionManagerState): void {
  for (const pendingTimeout of state.peerDisconnectGraceTimers.values()) {
    clearTimeout(pendingTimeout);
  }

  state.peerDisconnectGraceTimers.clear();
}
/**
 * Give a disconnected peer `PEER_DISCONNECT_GRACE_MS` to recover on its own
 * before tearing the connection down and starting the reconnect loop.
 *
 * At most one grace timer exists per peer; a second call while one is pending
 * is a no-op. When the timer fires:
 * - if the peer has no active connection entry any more, nothing happens;
 * - if the connection state is connected or `'connecting'`, recovery is logged
 *   and nothing else is done;
 * - otherwise the peer is tracked as disconnected, removed (keeping reconnect
 *   state) and a reconnect loop is scheduled.
 *
 * @param context Manager context supplying `logger` and `state`.
 * @param peerId Identifier of the peer that dropped.
 * @param handlers Callbacks used to remove and re-create the peer.
 */
export function schedulePeerDisconnectRecovery(
  context: PeerConnectionManagerContext,
  peerId: string,
  handlers: RecoveryHandlers
): void {
  const { logger, state } = context;

  // A grace timer is already pending for this peer; do not stack another.
  if (state.peerDisconnectGraceTimers.has(peerId)) {
    return;
  }

  logger.warn('Peer temporarily disconnected; waiting before reconnect', { peerId });

  const onGraceExpired = (): void => {
    state.peerDisconnectGraceTimers.delete(peerId);

    const current = state.activePeerConnections.get(peerId);

    if (!current) {
      return;
    }

    const currentState = current.connection.connectionState;
    const hasRecovered =
      currentState === CONNECTION_STATE_CONNECTED || currentState === 'connecting';

    if (hasRecovered) {
      logger.info('Peer recovered before disconnect grace expired', {
        peerId,
        state: currentState
      });
      return;
    }

    logger.warn('Peer still disconnected after grace period; recreating connection', {
      peerId,
      state: currentState
    });

    // Track first so the tracker entry survives the removal below.
    trackDisconnectedPeer(state, peerId);
    handlers.removePeer(peerId, { preserveReconnectState: true });
    schedulePeerReconnect(context, peerId, handlers);
  };

  const graceTimer = setTimeout(onGraceExpired, PEER_DISCONNECT_GRACE_MS);

  state.peerDisconnectGraceTimers.set(peerId, graceTimer);
}
/**
 * Start a repeating reconnect loop for a peer, ticking every
 * `PEER_RECONNECT_INTERVAL_MS`. A second call while a loop is already
 * registered for the peer is a no-op.
 *
 * On each tick:
 * - if the peer is no longer in the disconnected-peer tracker, the loop stops;
 * - the attempt counter is incremented (also on ticks that end up skipped);
 * - once the counter reaches `PEER_RECONNECT_MAX_ATTEMPTS`, the loop stops and
 *   the tracker entry is dropped without attempting a reconnect on that tick;
 * - if signaling is not connected, the tick is skipped;
 * - otherwise a reconnect is attempted.
 *
 * @param context Manager context supplying `callbacks`, `logger` and `state`.
 * @param peerId Identifier of the peer to reconnect to.
 * @param handlers Callbacks forwarded to `attemptPeerReconnect`.
 */
export function schedulePeerReconnect(
  context: PeerConnectionManagerContext,
  peerId: string,
  handlers: RecoveryHandlers
): void {
  const { callbacks, logger, state } = context;

  if (state.peerReconnectTimers.has(peerId)) {
    return;
  }

  logger.info('Scheduling P2P reconnect', { peerId });

  const onTick = (): void => {
    const tracked = state.disconnectedPeerTracker.get(peerId);

    // Tracking was cleared elsewhere, so there is nothing left to reconnect.
    if (!tracked) {
      clearPeerReconnectTimer(state, peerId);
      return;
    }

    tracked.reconnectAttempts += 1;

    logger.info('P2P reconnect attempt', {
      peerId,
      attempt: tracked.reconnectAttempts
    });

    const budgetExhausted = tracked.reconnectAttempts >= PEER_RECONNECT_MAX_ATTEMPTS;

    if (budgetExhausted) {
      logger.info('P2P reconnect max attempts reached', { peerId });
      clearPeerReconnectTimer(state, peerId);
      state.disconnectedPeerTracker.delete(peerId);
      return;
    }

    if (!callbacks.isSignalingConnected()) {
      logger.info('Skipping P2P reconnect - no signaling connection', { peerId });
      return;
    }

    attemptPeerReconnect(context, peerId, handlers);
  };

  const reconnectTimer = setInterval(onTick, PEER_RECONNECT_INTERVAL_MS);

  state.peerReconnectTimers.set(peerId, reconnectTimer);
}
/**
 * Make one reconnect attempt towards `peerId`.
 *
 * Any existing connection entry for the peer is removed first (keeping
 * reconnect state). A new peer connection is then created, and the side that
 * sends the offer is chosen deterministically: the local side initiates only
 * when its `oderId` compares lower than `peerId`. When the local `oderId` is
 * not available yet, the connection is created as non-initiator and no offer
 * is sent.
 *
 * @param context Manager context supplying `callbacks`, `logger` and `state`.
 * @param peerId Identifier of the peer to reconnect to.
 * @param handlers Callbacks used to remove/create the peer and send the offer.
 */
export function attemptPeerReconnect(
  context: PeerConnectionManagerContext,
  peerId: string,
  handlers: RecoveryHandlers
): void {
  const { callbacks, logger, state } = context;

  if (state.activePeerConnections.has(peerId)) {
    handlers.removePeer(peerId, { preserveReconnectState: true });
  }

  const localOderId = callbacks.getIdentifyCredentials()?.oderId ?? null;

  if (!localOderId) {
    logger.info('Skipping reconnect offer until logical identity is ready', { peerId });
    handlers.createPeerConnection(peerId, false);
    return;
  }

  const isInitiator = localOderId !== peerId && localOderId < peerId;

  handlers.createPeerConnection(peerId, isInitiator);

  if (!isInitiator) {
    logger.info('Waiting for remote reconnect offer based on deterministic initiator selection', {
      localOderId,
      peerId
    });
    return;
  }

  // Fire-and-forget: the returned promise is intentionally not awaited.
  void handlers.createAndSendOffer(peerId);
}
/**
 * Ask a peer for its voice state over the data channel.
 *
 * The request is sent only when the peer has a data channel whose
 * `readyState` equals `DATA_CHANNEL_STATE_OPEN`; otherwise this is a no-op.
 * A failure while sending is logged as a warning and not rethrown.
 *
 * @param state Manager state holding the active peer connections.
 * @param logger Logger used to report a failed send.
 * @param peerId Identifier of the peer to query.
 */
export function requestVoiceStateFromPeer(
  state: PeerConnectionManagerState,
  logger: PeerConnectionManagerContext['logger'],
  peerId: string
): void {
  const channel = state.activePeerConnections.get(peerId)?.dataChannel;

  if (!channel || channel.readyState !== DATA_CHANNEL_STATE_OPEN) {
    return;
  }

  try {
    const request = JSON.stringify({ type: P2P_TYPE_VOICE_STATE_REQUEST });

    channel.send(request);
  } catch (error) {
    logger.warn('Failed to request voice state', error);
  }
}
/**
 * Return the IDs of the currently-connected peers as a new array, so callers
 * can mutate the result without affecting the stored list.
 */
export function getConnectedPeerIds(state: PeerConnectionManagerState): string[] {
  return Array.from(state.connectedPeersList);
}
/**
 * Append a peer to the connected list and notify subscribers.
 *
 * The list is replaced with a new array rather than mutated in place. When
 * the peer is already listed, nothing changes and nothing is emitted.
 */
export function addToConnectedPeers(state: PeerConnectionManagerState, peerId: string): void {
  const alreadyListed = state.connectedPeersList.includes(peerId);

  if (alreadyListed) {
    return;
  }

  state.connectedPeersList = state.connectedPeersList.concat(peerId);
  state.connectedPeersChanged$.next(state.connectedPeersList);
}
/**
 * Drop a peer from the connected list and notify subscribers.
 *
 * The list is always replaced with a new array and an event is always
 * emitted, even when `peerId` was not in the list.
 */
export function removeFromConnectedPeers(
  state: PeerConnectionManagerState,
  peerId: string
): void {
  const remaining: string[] = [];

  for (const connectedId of state.connectedPeersList) {
    if (connectedId !== peerId) {
      remaining.push(connectedId);
    }
  }

  state.connectedPeersList = remaining;
  state.connectedPeersChanged$.next(state.connectedPeersList);
}
/**
 * Empty the connected-peers list and tell subscribers that no peers remain.
 *
 * The stored list and the emitted value are two separate empty arrays.
 */
export function resetConnectedPeers(state: PeerConnectionManagerState): void {
  const emptySnapshot: string[] = [];

  state.connectedPeersList = [];
  state.connectedPeersChanged$.next(emptySnapshot);
}