443 lines
16 KiB
JavaScript
443 lines
16 KiB
JavaScript
/**
|
||
* VoiceLevelingProcessor — AudioWorkletProcessor that implements
|
||
* broadcast-grade per-speaker automatic gain control (AGC).
|
||
*
|
||
* ═══════════════════════════════════════════════════════════════════
|
||
* DSP DESIGN NOTES
|
||
* ═══════════════════════════════════════════════════════════════════
|
||
*
|
||
* This processor mimics WebRTC's Gain Controller 2 (AGC2) behaviour
|
||
* using a lightweight algorithm suitable for real-time voice in an
|
||
* AudioWorklet thread.
|
||
*
|
||
* Pipeline (per 128-sample render quantum ≈ 2.67 ms @ 48 kHz):
|
||
*
|
||
* 1. RMS level estimation (short-term envelope)
|
||
* 2. Silence gate (freeze gain when below noise floor)
|
||
* 3. Target gain compute (desired dBFS → linear gain)
|
||
* 4. Gain clamp (limit boost to the max gain, cap attenuation)
* 5. Gain smoothing (exponential attack / release)
|
||
* 6. Soft-clip limiter (prevent digital overs)
|
||
*
|
||
* Key properties:
|
||
* • No per-frame allocation — all buffers pre-allocated.
|
||
* • Synchronous processing — no message passing in hot path.
|
||
* • Uses Float32 throughout — native AudioWorklet format.
|
||
* • 128-sample quantum fits within 10 ms at 48 kHz (2.67 ms).
|
||
*
|
||
* The processor receives configuration via AudioWorkletNode.port
|
||
* messages and applies them on the next render quantum.
|
||
*
|
||
* ═══════════════════════════════════════════════════════════════════
|
||
*/
|
||
|
||
/* ──────────────────────────────────────────────────────────────── */
|
||
/* Constants */
|
||
/* ──────────────────────────────────────────────────────────────── */
|
||
|
||
/** Identifier handed to `registerProcessor` for this worklet processor. */
const PROCESSOR_NAME = 'VoiceLevelingProcessor';

/**
 * Number of frames assumed per `process()` call (one Web Audio render
 * quantum). Used to convert the sample rate into an update rate for the
 * smoothing filters.
 */
const RENDER_QUANTUM_FRAMES = 128;

/**
 * Linear RMS level under which the input counts as silence
 * (0.001 linear = −60 dBFS).
 */
const DEFAULT_SILENCE_THRESHOLD = 1e-3;

/** Default RMS loudness the leveller steers toward, in dBFS. */
const DEFAULT_TARGET_DBFS = -18;

/** Default upper bound on the boost the leveller may apply, in dB. */
const DEFAULT_MAX_GAIN_DB = 12;

/**
 * Absolute sample value above which the soft clipper starts compressing;
 * samples at or below it pass through untouched.
 */
const SOFT_CLIP_THRESHOLD = 0.95;
|
||
|
||
/**
 * Speed presets: attack and release time constants, in seconds.
 *
 * Attack  = how fast gain *decreases* when a loud signal arrives.
 * Release = how fast gain *increases* when the signal gets quieter.
 *
 * Asymmetric on purpose: a fast attack prevents clipping, a slow release
 * sounds natural and avoids "pumping".
 *
 * The table has no prototype and is frozen, so a preset name received from
 * outside (e.g. 'constructor' or 'toString') can never resolve to an
 * inherited Object.prototype member, and the presets cannot be modified at
 * run time.
 */
const SPEED_PRESETS = Object.freeze(
  Object.assign(Object.create(null), {
    slow: Object.freeze({ attack: 0.015, release: 0.800 }),
    medium: Object.freeze({ attack: 0.010, release: 0.400 }),
    fast: Object.freeze({ attack: 0.005, release: 0.150 }),
  }),
);
|
||
|
||
/**
 * AGC strength presets: factor applied to the computed gain correction.
 * 1.0 = full correction toward the target; lower values level more gently.
 *
 * The table has no prototype and is frozen, so a preset name received from
 * outside (e.g. 'constructor') can never resolve to an inherited
 * Object.prototype member, and the presets cannot be modified at run time.
 */
const STRENGTH_PRESETS = Object.freeze(
  Object.assign(Object.create(null), {
    low: 0.5,
    medium: 0.75,
    high: 1.0,
  }),
);
|
||
|
||
/**
 * Time constant, in seconds, with which the applied gain relaxes back
 * toward unity (1.0) while the input is silent and the noise gate is on.
 * This keeps a large boost from lingering through a long pause and then
 * blasting the listener when speech resumes.
 */
const SILENCE_DECAY_TC = 2;
|
||
|
||
/* ──────────────────────────────────────────────────────────────── */
|
||
/* Helpers */
|
||
/* ──────────────────────────────────────────────────────────────── */
|
||
|
||
/**
 * Translate a level in decibels into a linear amplitude ratio.
 *
 * @param {number} db Level in dB (0 → 1.0, +20 → 10, −20 → 0.1).
 * @returns {number} The ratio 10^(db / 20).
 */
function dbToLinear(db) {
  const exponent = db / 20;
  return 10 ** exponent;
}
|
||
|
||
/**
 * Express a linear amplitude as a level in dBFS.
 *
 * @param {number} linear Linear amplitude.
 * @returns {number} 20·log10(linear); −Infinity when `linear` is zero or
 *   negative.
 */
function linearToDb(linear) {
  return linear <= 0 ? -Infinity : 20 * Math.log10(linear);
}
|
||
|
||
/**
 * Derive the one-pole smoothing coefficient α for a time constant, given
 * how many times per second the filter is updated.
 *
 * The envelope and gain filters in this file advance once per render
 * quantum, so `fps` must be the quantum rate (sampleRate / 128), not the
 * sample rate. Passing the raw sample rate would yield a far smaller α and
 * the AGC would appear frozen.
 *
 *   α = 1 − e^(−1 / (tc · fps))
 *
 * A larger α means a faster response.
 *
 * @param {number} tc  Time constant in seconds; zero or negative means
 *   "instant" and yields α = 1.
 * @param {number} fps Filter updates per second.
 * @returns {number} Smoothing coefficient.
 */
function timeConstantToAlpha(tc, fps) {
  const isInstant = tc <= 0;
  if (isInstant) {
    return 1.0;
  }

  const updatesPerTimeConstant = tc * fps;
  const retainedFraction = Math.exp(-1.0 / updatesPerTimeConstant);
  return 1.0 - retainedFraction;
}
|
||
|
||
/**
 * Create a zero-filled Float32Array of the requested length, backed by a
 * SharedArrayBuffer when the environment exposes one and by ordinary
 * memory otherwise.
 *
 * @param {number} length Number of elements.
 * @returns {Float32Array}
 */
function allocateBuffer(length) {
  if (typeof SharedArrayBuffer !== 'undefined') {
    try {
      const byteLength = length * Float32Array.BYTES_PER_ELEMENT;
      return new Float32Array(new SharedArrayBuffer(byteLength));
    } catch {
      // Shared allocation was refused; use the plain allocation below.
    }
  }

  return new Float32Array(length);
}
|
||
|
||
/**
 * Limit a sample with a tanh knee so that it never exceeds ±1.
 *
 * Samples whose magnitude is at or below SOFT_CLIP_THRESHOLD are returned
 * unchanged. Larger magnitudes are squeezed into the remaining headroom
 * (threshold..1) by a tanh curve, identically for both polarities.
 *
 * @param {number} sample Input sample.
 * @returns {number} Limited sample.
 */
function softClip(sample) {
  const magnitude = Math.abs(sample);
  if (magnitude <= SOFT_CLIP_THRESHOLD) {
    return sample;
  }

  // Overshoot past the knee, measured in units of the available headroom.
  const overshoot = (magnitude - SOFT_CLIP_THRESHOLD) / (1 - SOFT_CLIP_THRESHOLD);
  const limited = SOFT_CLIP_THRESHOLD + (1 - SOFT_CLIP_THRESHOLD) * Math.tanh(overshoot);

  return sample >= 0 ? limited : -limited;
}
|
||
|
||
/* ──────────────────────────────────────────────────────────────── */
|
||
/* Processor */
|
||
/* ──────────────────────────────────────────────────────────────── */
|
||
|
||
/**
 * AudioWorkletProcessor implementing per-speaker automatic gain control.
 *
 * Once per render quantum it measures the RMS level of the first input
 * channel, derives a target gain that moves that level toward
 * `_targetDbfs`, smooths the gain with separate attack/release
 * coefficients, and writes every input channel to the matching output
 * channel multiplied by that gain and passed through `softClip`.
 *
 * Configuration arrives as plain objects on `this.port`; see
 * `_handleMessage` for the accepted keys.
 */
class VoiceLevelingProcessor extends AudioWorkletProcessor {
  /* ── State ──────────────────────────────────────────────────── */

  /** Whether processing is enabled (input is copied through when false). */
  _enabled = true;

  /** Target loudness in dBFS. */
  _targetDbfs = DEFAULT_TARGET_DBFS;

  /** Maximum gain boost in dB. */
  _maxGainDb = DEFAULT_MAX_GAIN_DB;

  /** Linear ceiling for the gain multiplier (derived from `_maxGainDb`). */
  _maxGainLinear = dbToLinear(DEFAULT_MAX_GAIN_DB);

  /** AGC strength factor (0–1). Scales the gain correction in dB. */
  _strength = STRENGTH_PRESETS.medium;

  /**
   * When true, the gain decays toward unity while the input is silent.
   * When false, the gain is simply held during silence.
   */
  _noiseGateEnabled = false;

  /** RMS threshold (linear) below which input is treated as silence. */
  _silenceThreshold = DEFAULT_SILENCE_THRESHOLD;

  /** Attack smoothing coefficient (set by `_applySpeed`). */
  _alphaAttack = 0;

  /** Release smoothing coefficient (set by `_applySpeed`). */
  _alphaRelease = 0;

  /** Silence-decay smoothing coefficient (set by `_applySpeed`). */
  _alphaSilenceDecay = 0;

  /**
   * Running mean-square envelope of the first input channel, smoothed
   * with a one-pole filter. Kept squared; the square root is taken once
   * per quantum when the gain is computed.
   */
  _envelopeSq = 0;

  /** Gain currently applied to the signal (linear). */
  _currentGain = 1.0;

  /**
   * Pre-allocated buffer of one render quantum. Not currently read by
   * `process()`.
   */
  _scratchBuffer = allocateBuffer(RENDER_QUANTUM_FRAMES);

  /* ── Constructor ────────────────────────────────────────────── */

  constructor(options) {
    super(options);

    // Derive the smoothing coefficients for the default speed.
    this._applySpeed('medium');

    // Configuration changes arrive from the main thread on the port.
    this.port.onmessage = (event) => this._handleMessage(event.data);
  }

  /* ── Configuration ──────────────────────────────────────────── */

  /**
   * Apply a configuration message. Unknown keys and invalid values are
   * ignored, leaving the corresponding setting unchanged.
   *
   * Accepted keys:
   *   enabled    : boolean
   *   targetDbfs : number  (clamped to −30 … −12; NaN is ignored)
   *   maxGainDb  : number  (clamped to 3 … 20; NaN is ignored)
   *   strength   : 'low' | 'medium' | 'high'
   *   speed      : 'slow' | 'medium' | 'fast'
   *   noiseGate  : boolean
   *
   * @param {object} msg
   */
  _handleMessage(msg) {
    if (msg == null || typeof msg !== 'object') return;

    if (typeof msg.enabled === 'boolean') {
      this._enabled = msg.enabled;
      if (!msg.enabled) {
        // Reset to unity on disable so that re-enabling starts clean.
        this._currentGain = 1.0;
        this._envelopeSq = 0;
      }
    }

    // `typeof NaN` is 'number' and NaN survives Math.min/Math.max, so it
    // must be rejected explicitly or every later gain would become NaN.
    if (typeof msg.targetDbfs === 'number' && !Number.isNaN(msg.targetDbfs)) {
      this._targetDbfs = Math.max(-30, Math.min(-12, msg.targetDbfs));
    }

    if (typeof msg.maxGainDb === 'number' && !Number.isNaN(msg.maxGainDb)) {
      const clamped = Math.max(3, Math.min(20, msg.maxGainDb));
      this._maxGainDb = clamped;
      this._maxGainLinear = dbToLinear(clamped);
    }

    if (this._isPresetName(STRENGTH_PRESETS, msg.strength)) {
      this._strength = STRENGTH_PRESETS[msg.strength];
    }

    if (this._isPresetName(SPEED_PRESETS, msg.speed)) {
      this._applySpeed(msg.speed);
    }

    if (typeof msg.noiseGate === 'boolean') {
      this._noiseGateEnabled = msg.noiseGate;
    }
  }

  /**
   * Tell whether `name` is a string naming an entry defined directly on
   * `table`. Inherited members such as 'constructor' do not count.
   *
   * @param {object} table Preset table.
   * @param {*} name Candidate preset name.
   * @returns {boolean}
   */
  _isPresetName(table, name) {
    return typeof name === 'string'
      && Object.prototype.hasOwnProperty.call(table, name);
  }

  /**
   * Recompute the attack, release and silence-decay coefficients for a
   * speed preset at the current sample rate.
   *
   * The coefficients are derived from the quantum rate
   * (sampleRate / RENDER_QUANTUM_FRAMES), not the raw sample rate,
   * because the filters advance once per render quantum.
   *
   * @param {'slow' | 'medium' | 'fast'} preset
   */
  _applySpeed(preset) {
    const { attack, release } = SPEED_PRESETS[preset];
    const fps = sampleRate / RENDER_QUANTUM_FRAMES;

    this._alphaAttack = timeConstantToAlpha(attack, fps);
    this._alphaRelease = timeConstantToAlpha(release, fps);
    this._alphaSilenceDecay = timeConstantToAlpha(SILENCE_DECAY_TC, fps);
  }

  /* ── DSP ────────────────────────────────────────────────────── */

  /**
   * Main audio processing callback.
   *
   * @param {Float32Array[][]} inputs  Input channels.
   * @param {Float32Array[][]} outputs Output channels.
   * @returns {boolean} Always `true`, to keep the processor alive.
   */
  process(inputs, outputs) {
    const input = inputs[0];
    const output = outputs[0];

    // Nothing connected on the input side: leave the output untouched.
    if (!input || input.length === 0 || !input[0]) {
      return true;
    }
    // No output channel to write into: nothing to do.
    if (!output || !output[0]) {
      return true;
    }

    const numSamples = input[0].length;

    // ── Bypass mode ──────────────────────────────────────────
    if (!this._enabled) {
      this._copyThrough(input, output, numSamples);
      return true;
    }

    // ── 1. Level estimation (first channel only) ─────────────
    let sumSq = 0;
    const reference = input[0];
    for (let i = 0; i < numSamples; i++) {
      const s = reference[i];
      sumSq += s * s;
    }
    const frameMeanSq = sumSq / numSamples;

    // A quantum containing NaN/Infinity samples (or no samples at all)
    // gives a non-finite measurement. Feeding it to the filters would
    // turn the envelope and the gain into NaN permanently, so such a
    // quantum leaves the state as it was.
    if (Number.isFinite(frameMeanSq)) {
      const rms = this._trackEnvelope(frameMeanSq);
      this._updateGain(rms);
    }

    // ── 6. Apply gain & soft-clip ────────────────────────────
    this._render(input, output, numSamples, this._currentGain);
    return true;
  }

  /**
   * Copy every input channel to the matching output channel unchanged.
   *
   * @param {Float32Array[]} input
   * @param {Float32Array[]} output
   * @param {number} numSamples Samples to copy per channel.
   */
  _copyThrough(input, output, numSamples) {
    for (let ch = 0; ch < input.length; ch++) {
      const src = input[ch];
      const dst = output[ch];
      if (!src || !dst) continue;
      for (let i = 0; i < numSamples; i++) {
        dst[i] = src[i];
      }
    }
  }

  /**
   * Advance the mean-square envelope by one quantum and return the
   * smoothed RMS. Rising levels use the attack coefficient, falling
   * levels the release coefficient.
   *
   * @param {number} frameMeanSq Mean of the squared samples of this quantum.
   * @returns {number} Smoothed RMS (linear), never below 1e-6.
   */
  _trackEnvelope(frameMeanSq) {
    const alpha = frameMeanSq > this._envelopeSq
      ? this._alphaAttack
      : this._alphaRelease;
    this._envelopeSq += alpha * (frameMeanSq - this._envelopeSq);

    return Math.sqrt(Math.max(this._envelopeSq, 1e-12));
  }

  /**
   * Advance `_currentGain` by one quantum for the given smoothed RMS.
   *
   * @param {number} rms Smoothed RMS level (linear).
   */
  _updateGain(rms) {
    // ── 2. Silence gate ──────────────────────────────────────
    // During silence no new target is computed. With the noise gate on
    // the gain relaxes slowly toward unity; with it off the gain is held.
    if (rms < this._silenceThreshold) {
      if (this._noiseGateEnabled) {
        this._currentGain += this._alphaSilenceDecay * (1.0 - this._currentGain);
      }
      return;
    }

    // ── 3. Target gain ───────────────────────────────────────
    // Distance to the target in dB, scaled by the strength factor
    // (1.0 corrects fully, 0.5 applies half of the correction).
    const errorDb = this._targetDbfs - linearToDb(rms);
    let desiredGain = dbToLinear(errorDb * this._strength);

    // ── 4. Gain clamp ────────────────────────────────────────
    // Boost is limited to the configured maximum; attenuation is
    // limited to a factor of 0.5 (about −6 dB).
    if (desiredGain > this._maxGainLinear) {
      desiredGain = this._maxGainLinear;
    }
    if (desiredGain < 0.5) {
      desiredGain = 0.5;
    }

    // ── 5. Gain smoothing ────────────────────────────────────
    // Fast attack when the gain must come down (a loud signal arrived),
    // slow release when it may go up (the signal got quieter).
    const gainAlpha = desiredGain < this._currentGain
      ? this._alphaAttack
      : this._alphaRelease;
    this._currentGain += gainAlpha * (desiredGain - this._currentGain);
  }

  /**
   * Write every input channel to the matching output channel, multiplied
   * by `gain` and soft-clipped.
   *
   * @param {Float32Array[]} input
   * @param {Float32Array[]} output
   * @param {number} numSamples Samples to write per channel.
   * @param {number} gain Linear gain applied to all channels.
   */
  _render(input, output, numSamples, gain) {
    for (let ch = 0; ch < input.length; ch++) {
      const src = input[ch];
      const dst = output[ch];
      if (!src || !dst) continue;
      for (let i = 0; i < numSamples; i++) {
        dst[i] = softClip(src[i] * gain);
      }
    }
  }
}
|
||
|
||
// Register the processor class under PROCESSOR_NAME in the worklet scope.
registerProcessor(PROCESSOR_NAME, VoiceLevelingProcessor);
|