Files
Toju/public/voice-leveling-worklet.js
2026-03-03 03:41:59 +01:00

443 lines
16 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* VoiceLevelingProcessor — AudioWorkletProcessor that implements
* broadcast-grade per-speaker automatic gain control (AGC).
*
* ═══════════════════════════════════════════════════════════════════
* DSP DESIGN NOTES
* ═══════════════════════════════════════════════════════════════════
*
* This processor mimics WebRTC's Gain Controller 2 (AGC2) behaviour
* using a lightweight algorithm suitable for real-time voice in an
* AudioWorklet thread.
*
* Pipeline (per 128-sample render quantum ≈ 2.67 ms @ 48 kHz):
*
* 1. RMS level estimation (short-term envelope)
* 2. Silence gate (freeze gain when below noise floor)
* 3. Target gain compute (desired dBFS → linear gain)
* 4. Gain smoothing (exponential attack / release)
* 5. Max-gain clamp (prevent runaway boost)
* 6. Soft-clip limiter (prevent digital overs)
*
* Key properties:
* • No per-frame allocation — all buffers pre-allocated.
* • Synchronous processing — no message passing in hot path.
* • Uses Float32 throughout — native AudioWorklet format.
* • 128-sample quantum fits within 10 ms at 48 kHz (2.67 ms).
*
* The processor receives configuration via AudioWorkletNode.port
* messages and applies them on the next render quantum.
*
* ═══════════════════════════════════════════════════════════════════
*/
/* ──────────────────────────────────────────────────────────────── */
/* Constants */
/* ──────────────────────────────────────────────────────────────── */
/** Name under which the processor class is passed to `registerProcessor`. */
const PROCESSOR_NAME = 'VoiceLevelingProcessor';

/**
 * Frames per Web Audio render quantum, i.e. the number of samples
 * handled by a single `process()` call. The AudioWorklet spec
 * mandates 128.
 */
const RENDER_QUANTUM_FRAMES = 128;

/**
 * Linear RMS level below which the input counts as silence
 * (0.001 linear is -60 dBFS). Gain is frozen or decayed while the
 * signal is this quiet.
 */
const DEFAULT_SILENCE_THRESHOLD = 1e-3;

/**
 * Default target RMS level in dBFS. -18 dBFS is a comfortable
 * conversational loudness for headphone listening.
 */
const DEFAULT_TARGET_DBFS = -18;

/** Default upper bound on the gain boost, in dB. */
const DEFAULT_MAX_GAIN_DB = 12;

/** Amplitude above which the soft-clip limiter starts compressing. */
const SOFT_CLIP_THRESHOLD = 0.95;

/**
 * Speed presets: attack / release time constants in seconds.
 *
 * attack  - how quickly gain is pulled DOWN when a loud signal arrives.
 * release - how quickly gain is brought back UP when the signal gets
 *           quieter.
 *
 * The pair is deliberately asymmetric: a fast attack prevents
 * clipping, a slow release sounds natural and avoids "pumping".
 */
const SPEED_PRESETS = {
  slow: {
    attack: 0.015,
    release: 0.8,
  },
  medium: {
    attack: 0.01,
    release: 0.4,
  },
  fast: {
    attack: 0.005,
    release: 0.15,
  },
};

/**
 * AGC strength presets: factor applied to the computed gain
 * correction. 1 = full correction toward the target; smaller values
 * level more gently.
 */
const STRENGTH_PRESETS = {
  low: 0.5,
  medium: 0.75,
  high: 1,
};

/**
 * Time constant (seconds) of the decay toward unity gain (1.0) that
 * is applied during silence when the noise gate is enabled. It keeps
 * the gain from sitting at a large value through a long pause and
 * then blasting the listener when speech resumes.
 */
const SILENCE_DECAY_TC = 2;
/* ──────────────────────────────────────────────────────────────── */
/* Helpers */
/* ──────────────────────────────────────────────────────────────── */
/**
 * Convert a level in decibels to a linear gain factor.
 *
 * @param {number} db Level in dB (0 dB maps to a gain of 1).
 * @returns {number} Linear gain, 10^(db / 20).
 */
function dbToLinear(db) {
  const exponent = db / 20;
  return 10 ** exponent;
}
/**
 * Convert a linear amplitude to dBFS.
 *
 * @param {number} linear Linear amplitude.
 * @returns {number} 20 * log10(linear), or -Infinity when `linear`
 *   is zero or negative.
 */
function linearToDb(linear) {
  const hasNoLevel = linear <= 0;
  return hasNoLevel ? -Infinity : 20 * Math.log10(linear);
}
/**
 * Derive the one-pole smoothing coefficient (alpha) for a time
 * constant, given the rate at which the smoother is *updated*.
 *
 * The envelope / gain update runs once per render quantum, so `fps`
 * must be quanta-per-second (sampleRate / 128) and NOT the raw
 * sample rate. Passing samples-per-second would yield a tiny alpha
 * and the AGC would appear frozen.
 *
 *   alpha = 1 - e^(-1 / (tc * fps))
 *
 * A larger alpha means a faster response. A non-positive time
 * constant means "no smoothing" and yields exactly 1.
 *
 * @param {number} tc Time constant in seconds.
 * @param {number} fps Update rate (render quanta per second).
 * @returns {number} Smoothing coefficient in the range 0-1.
 */
function timeConstantToAlpha(tc, fps) {
  const isInstant = tc <= 0;
  if (isInstant) {
    return 1;
  }
  const updatesPerTimeConstant = tc * fps;
  const retained = Math.exp(-1 / updatesPerTimeConstant);
  return 1 - retained;
}
/**
 * Allocate a Float32Array of `length` elements, backed by a
 * SharedArrayBuffer when the environment provides one and by
 * ordinary memory otherwise.
 *
 * Any error raised while creating the shared backing store is
 * deliberately ignored; the function then falls back to a plain
 * Float32Array.
 *
 * @param {number} length Number of elements.
 * @returns {Float32Array}
 */
function allocateBuffer(length) {
  const sharedMemoryAvailable = typeof SharedArrayBuffer !== 'undefined';
  if (sharedMemoryAvailable) {
    try {
      const backing = new SharedArrayBuffer(length * Float32Array.BYTES_PER_ELEMENT);
      return new Float32Array(backing);
    } catch {
      // Best effort only: use the non-shared fallback below.
    }
  }
  return new Float32Array(length);
}
/**
 * tanh-based soft clipper that keeps samples from exceeding full
 * scale while leaving ordinary-level audio untouched.
 *
 * Samples whose magnitude is at or below SOFT_CLIP_THRESHOLD are
 * returned unchanged. Above it, the overshoot is squeezed through
 * tanh into the remaining headroom (threshold..1), symmetrically
 * for positive and negative samples.
 *
 * @param {number} sample Input sample.
 * @returns {number} Clipped sample.
 */
function softClip(sample) {
  const magnitude = Math.abs(sample);
  if (magnitude <= SOFT_CLIP_THRESHOLD) {
    return sample;
  }

  // Room left between the threshold and full scale.
  const headroom = 1 - SOFT_CLIP_THRESHOLD;
  // Overshoot expressed in units of headroom, then compressed into 0..1.
  const overshoot = (magnitude - SOFT_CLIP_THRESHOLD) / headroom;
  const limited = SOFT_CLIP_THRESHOLD + headroom * Math.tanh(overshoot);

  const polarity = sample >= 0 ? 1 : -1;
  return polarity * limited;
}
/* ──────────────────────────────────────────────────────────────── */
/* Processor */
/* ──────────────────────────────────────────────────────────────── */
/**
 * AudioWorkletProcessor implementing a per-speaker automatic gain
 * control: RMS envelope follower → silence handling → target-gain
 * computation (clamped) → attack/release gain smoothing → gain
 * application with a soft-clip limiter.
 *
 * The level is measured on channel 0 only; the resulting gain is
 * applied to every input channel that has a matching output channel.
 * Configuration arrives through `port` messages (see
 * `_handleMessage`).
 */
class VoiceLevelingProcessor extends AudioWorkletProcessor {
  /* ── State ──────────────────────────────────────────────────── */

  /** Whether leveling is active; when false input is copied through unchanged. */
  _enabled = true;

  /** Target loudness in dBFS. */
  _targetDbfs = DEFAULT_TARGET_DBFS;

  /** Maximum gain boost in dB. */
  _maxGainDb = DEFAULT_MAX_GAIN_DB;

  /** Linear ceiling for the gain multiplier (derived from `_maxGainDb`). */
  _maxGainLinear = dbToLinear(DEFAULT_MAX_GAIN_DB);

  /** AGC strength factor (0-1). Scales the gain correction, in dB. */
  _strength = STRENGTH_PRESETS.medium;

  /**
   * Whether the silence gate is active. When true, gain decays toward
   * unity during silence; when false, gain is frozen during silence.
   */
  _noiseGateEnabled = false;

  /** RMS threshold (linear) below which input is treated as silence. */
  _silenceThreshold = DEFAULT_SILENCE_THRESHOLD;

  /** Attack smoothing coefficient (per render quantum). */
  _alphaAttack = 0;

  /** Release smoothing coefficient (per render quantum). */
  _alphaRelease = 0;

  /** Silence-decay smoothing coefficient (per render quantum). */
  _alphaSilenceDecay = 0;

  /**
   * Running mean-square envelope (kept squared so the sqrt is taken
   * once per quantum). Smoothed with a one-pole filter.
   */
  _envelopeSq = 0;

  /** Currently applied gain (linear). Smoothed toward the desired gain. */
  _currentGain = 1.0;

  /**
   * Pre-allocated buffer of one render quantum.
   * NOTE(review): nothing in this class reads or writes it at present.
   */
  _scratchBuffer = allocateBuffer(RENDER_QUANTUM_FRAMES);

  /* ── Constructor ────────────────────────────────────────────── */

  constructor(options) {
    super(options);

    // Compute smoothing coefficients from the default speed.
    this._applySpeed('medium');

    // Listen for configuration changes from the main thread.
    this.port.onmessage = (event) => this._handleMessage(event.data);
  }

  /* ── Configuration ──────────────────────────────────────────── */

  /**
   * Look up `key` in a preset table, considering own properties only.
   *
   * A plain `table[key]` lookup would also match inherited names such
   * as 'toString' or 'constructor' and hand back a function, which
   * then turns the gain maths into NaN.
   *
   * @param {object} table Preset table (SPEED_PRESETS / STRENGTH_PRESETS).
   * @param {*} key Candidate preset name.
   * @returns {*} The preset value, or `undefined` when `key` is not a
   *   string naming an own property of `table`.
   */
  _ownPreset(table, key) {
    if (typeof key !== 'string') return undefined;
    if (!Object.prototype.hasOwnProperty.call(table, key)) return undefined;
    return table[key];
  }

  /**
   * Handle a configuration message from the main thread.
   *
   * Accepted keys (anything else, or a value of the wrong type, is
   * ignored and leaves the current setting untouched):
   *   enabled    : boolean
   *   targetDbfs : number, clamped to -30 … -12 (NaN is ignored)
   *   maxGainDb  : number, clamped to 3 … 20   (NaN is ignored)
   *   strength   : 'low' | 'medium' | 'high'
   *   speed      : 'slow' | 'medium' | 'fast'
   *   noiseGate  : boolean
   *
   * @param {object} msg
   */
  _handleMessage(msg) {
    if (msg == null || typeof msg !== 'object') return;

    if (typeof msg.enabled === 'boolean') {
      this._enabled = msg.enabled;
      if (!msg.enabled) {
        // Reset state on disable so re-enabling starts clean.
        this._currentGain = 1.0;
        this._envelopeSq = 0;
      }
    }

    // NaN survives Math.max/Math.min and would make every later gain
    // computation NaN, so it is rejected up front. ±Infinity is still
    // accepted and clamped into range.
    if (typeof msg.targetDbfs === 'number' && !Number.isNaN(msg.targetDbfs)) {
      this._targetDbfs = Math.max(-30, Math.min(-12, msg.targetDbfs));
    }

    if (typeof msg.maxGainDb === 'number' && !Number.isNaN(msg.maxGainDb)) {
      const clamped = Math.max(3, Math.min(20, msg.maxGainDb));
      this._maxGainDb = clamped;
      this._maxGainLinear = dbToLinear(clamped);
    }

    const strength = this._ownPreset(STRENGTH_PRESETS, msg.strength);
    if (strength != null) {
      this._strength = strength;
    }

    if (this._ownPreset(SPEED_PRESETS, msg.speed) != null) {
      this._applySpeed(msg.speed);
    }

    if (typeof msg.noiseGate === 'boolean') {
      this._noiseGateEnabled = msg.noiseGate;
    }
  }

  /**
   * Recompute the attack / release / silence-decay coefficients for
   * the given speed preset at the current sample rate.
   *
   * The rate handed to `timeConstantToAlpha` is quanta-per-second
   * (sampleRate / RENDER_QUANTUM_FRAMES), not the raw sample rate,
   * because the smoothing filters are stepped once per render
   * quantum rather than once per sample.
   *
   * An unknown preset name leaves the coefficients unchanged.
   *
   * @param {'slow' | 'medium' | 'fast'} preset
   */
  _applySpeed(preset) {
    const timing = this._ownPreset(SPEED_PRESETS, preset);
    if (timing == null) return;

    const { attack, release } = timing;
    const fps = sampleRate / RENDER_QUANTUM_FRAMES;

    this._alphaAttack = timeConstantToAlpha(attack, fps);
    this._alphaRelease = timeConstantToAlpha(release, fps);
    this._alphaSilenceDecay = timeConstantToAlpha(SILENCE_DECAY_TC, fps);
  }

  /* ── DSP ────────────────────────────────────────────────────── */

  /**
   * Main audio processing callback.
   *
   * @param {Float32Array[][]} inputs Input channels.
   * @param {Float32Array[][]} outputs Output channels.
   * @returns {boolean} `true` to keep the processor alive.
   */
  process(inputs, outputs) {
    const input = inputs[0];
    const output = outputs[0];

    // No input → leave the output buffers as they are.
    if (!input || input.length === 0 || !input[0]) {
      return true;
    }
    // No output channel to write into → nothing to do this quantum.
    if (!output || !output[0]) {
      return true;
    }

    const inputChannel = input[0];
    const numSamples = inputChannel.length;

    // ── Bypass mode ──────────────────────────────────────────
    if (!this._enabled) {
      // Copy every input channel that has a matching output channel.
      for (let ch = 0; ch < input.length; ch++) {
        const src = input[ch];
        const dst = output[ch];
        if (!src || !dst) continue;
        for (let i = 0; i < numSamples; i++) {
          dst[i] = src[i];
        }
      }
      return true;
    }

    // ── 1. RMS level estimation ──────────────────────────────
    //
    // Mean square of this quantum (channel 0), smoothed with a
    // one-pole filter. The sqrt is taken once per quantum below.
    let sumSq = 0;
    for (let i = 0; i < numSamples; i++) {
      const s = inputChannel[i];
      sumSq += s * s;
    }
    const frameMeanSq = sumSq / numSamples;

    // A NaN/Infinity sample (or an empty channel, 0/0) would make the
    // envelope non-finite, and a one-pole filter can never recover
    // from that. Hold the previous envelope for such a quantum.
    if (Number.isFinite(frameMeanSq)) {
      // Attack coefficient for rising levels, release for falling.
      const alpha = frameMeanSq > this._envelopeSq
        ? this._alphaAttack
        : this._alphaRelease;
      this._envelopeSq += alpha * (frameMeanSq - this._envelopeSq);
    }

    // Current smoothed RMS (linear), floored to keep log10 finite.
    const rms = Math.sqrt(Math.max(this._envelopeSq, 1e-12));

    // ── 2. Silence gate ──────────────────────────────────────
    //
    // Below the silence threshold no new gain target is computed.
    // With the gate enabled the gain decays slowly toward unity so
    // the listener is not slammed when speech resumes; with the gate
    // disabled the gain simply stays frozen.
    const isSilence = rms < this._silenceThreshold;

    if (isSilence) {
      if (this._noiseGateEnabled) {
        this._currentGain += this._alphaSilenceDecay * (1.0 - this._currentGain);
      }
    } else {
      // ── 3. Target gain computation ───────────────────────
      //
      // Desired gain = 10^(strength * (targetDbfs - currentDbfs) / 20)
      //
      // Scaling the dB error by the strength factor means "low"
      // strength applies only 50 % of the correction.
      const currentDbfs = linearToDb(rms);
      const errorDb = this._targetDbfs - currentDbfs;
      const correctionDb = errorDb * this._strength;

      let desiredGain = dbToLinear(correctionDb);

      // Clamp to the configured maximum boost.
      if (desiredGain > this._maxGainLinear) {
        desiredGain = this._maxGainLinear;
      }
      // Limit attenuation: never go below a gain of 0.5 (about -6 dB).
      if (desiredGain < 0.5) {
        desiredGain = 0.5;
      }

      // ── 4. Gain smoothing ──────────────────────────────
      //
      // Move the applied gain toward the desired gain: fast attack
      // when the gain must drop (loud signal arrived), slow release
      // when it may rise (signal got quieter).
      const gainAlpha = desiredGain < this._currentGain
        ? this._alphaAttack
        : this._alphaRelease;
      this._currentGain += gainAlpha * (desiredGain - this._currentGain);
    }

    // ── 5. Apply gain & soft-clip ─────────────────────────────
    //
    // The same gain is applied to every input channel that has a
    // matching output channel.
    const gain = this._currentGain;
    for (let ch = 0; ch < input.length; ch++) {
      const src = input[ch];
      const dst = output[ch];
      if (!src || !dst) continue;
      for (let i = 0; i < numSamples; i++) {
        dst[i] = softClip(src[i] * gain);
      }
    }

    return true;
  }
}
// Register the processor class under PROCESSOR_NAME
// ('VoiceLevelingProcessor'). Presumably the main thread passes the same
// name when constructing the AudioWorkletNode; verify against the caller.
registerProcessor(PROCESSOR_NAME, VoiceLevelingProcessor);