Voice Leveling (untested)

This commit is contained in:
2026-03-03 03:41:59 +01:00
parent cf91d77502
commit 8315df42fc
9 changed files with 1313 additions and 19 deletions

View File

@@ -0,0 +1,442 @@
/**
* VoiceLevelingProcessor — AudioWorkletProcessor that implements
* broadcast-grade per-speaker automatic gain control (AGC).
*
* ═══════════════════════════════════════════════════════════════════
* DSP DESIGN NOTES
* ═══════════════════════════════════════════════════════════════════
*
* This processor mimics WebRTC's Gain Controller 2 (AGC2) behaviour
* using a lightweight algorithm suitable for real-time voice in an
* AudioWorklet thread.
*
* Pipeline (per 128-sample render quantum ≈ 2.67 ms @ 48 kHz):
*
* 1. RMS level estimation (short-term envelope)
* 2. Silence gate (freeze gain when below noise floor)
* 3. Target gain compute (desired dBFS → linear gain)
* 4. Gain smoothing (exponential attack / release)
* 5. Max-gain clamp (prevent runaway boost)
* 6. Soft-clip limiter (prevent digital overs)
*
* Key properties:
* • No per-frame allocation — all buffers pre-allocated.
* • Synchronous processing — no message passing in hot path.
* • Uses Float32 throughout — native AudioWorklet format.
* • 128-sample quantum fits within 10 ms at 48 kHz (2.67 ms).
*
* The processor receives configuration via AudioWorkletNode.port
* messages and applies them on the next render quantum.
*
* ═══════════════════════════════════════════════════════════════════
*/
/* ──────────────────────────────────────────────────────────────── */
/* Constants */
/* ──────────────────────────────────────────────────────────────── */
/** Processor name registered with `registerProcessor`. */
const PROCESSOR_NAME = 'VoiceLevelingProcessor';

/**
 * Web Audio render quantum size — the number of samples processed
 * in each call to `process()`. The AudioWorklet spec mandates 128.
 */
const RENDER_QUANTUM_FRAMES = 128;

/**
 * Minimum RMS level (linear) below which the input is considered
 * silence. Gain is frozen/decayed when the signal is this quiet.
 * 0.001 linear is -60 dBFS (20 * log10(0.001)).
 */
const DEFAULT_SILENCE_THRESHOLD = 0.001;

/**
 * The target RMS level in dBFS. -18 dBFS is a comfortable
 * conversational loudness for headphone listening.
 */
const DEFAULT_TARGET_DBFS = -18;

/** Default maximum gain boost in dB. */
const DEFAULT_MAX_GAIN_DB = 12;

/** Soft-clip ceiling (linear amplitude) — prevents digital overs. */
const SOFT_CLIP_THRESHOLD = 0.95;

/**
 * Speed presets: attack and release time constants (seconds).
 *
 * Attack  = how fast gain *decreases* when a loud signal arrives.
 * Release = how fast gain *increases* when the signal gets quieter.
 *
 * Asymmetric: fast attack prevents clipping, slow release sounds
 * natural and avoids "pumping".
 *
 * Frozen (table and entries) so that no code path can mutate the
 * shared presets at runtime.
 */
const SPEED_PRESETS = Object.freeze({
  slow: Object.freeze({ attack: 0.015, release: 0.800 }),
  medium: Object.freeze({ attack: 0.010, release: 0.400 }),
  fast: Object.freeze({ attack: 0.005, release: 0.150 }),
});

/**
 * AGC strength presets: scale the computed gain adjustment.
 * 1.0 = full correction toward target; lower = gentler leveling.
 *
 * Frozen so that the shared table cannot be mutated at runtime.
 */
const STRENGTH_PRESETS = Object.freeze({
  low: 0.5,
  medium: 0.75,
  high: 1.0,
});

/**
 * When silence is detected, the gain decays toward 1.0 (unity)
 * with this time constant (seconds). This prevents the gain from
 * sitting at a huge value after long silence and then blasting
 * when speech resumes.
 */
const SILENCE_DECAY_TC = 2.0;
/* ──────────────────────────────────────────────────────────────── */
/* Helpers */
/* ──────────────────────────────────────────────────────────────── */
/**
 * Translate a level expressed in decibels into a linear amplitude
 * multiplier (20 dB per decade).
 *
 * @param {number} db Level in dB.
 * @returns {number} Equivalent linear gain, i.e. 10^(db / 20).
 */
function dbToLinear(db) {
  const exponent = db / 20;
  return 10 ** exponent;
}
/**
 * Express a linear amplitude in dBFS.
 *
 * @param {number} linear Linear amplitude.
 * @returns {number} 20 * log10(linear); -Infinity when `linear` is
 *   zero or negative.
 */
function linearToDb(linear) {
  return linear <= 0
    ? Number.NEGATIVE_INFINITY
    : 20 * Math.log10(linear);
}
/**
 * Derive the one-pole (exponential) smoothing coefficient α from a
 * time constant and an update rate.
 *
 * The rate is the number of filter updates per second. The filter
 * in this file is stepped once per render quantum, so callers pass
 * quanta-per-second (sampleRate / 128) rather than the raw sample
 * rate; feeding the sample rate would yield a far smaller α and a
 * gain that barely moves.
 *
 *   α = 1 - e^(-1 / (tc * fps))
 *
 * A larger α means a faster response.
 *
 * @param {number} tc  Time constant in seconds. Zero or negative
 *   values yield α = 1 (no smoothing).
 * @param {number} fps Update rate (render quanta per second).
 * @returns {number} Smoothing coefficient in the range 0-1.
 */
function timeConstantToAlpha(tc, fps) {
  if (tc <= 0) {
    return 1;
  }
  const updatesPerTimeConstant = tc * fps;
  const decay = Math.exp(-1 / updatesPerTimeConstant);
  return 1 - decay;
}
/**
 * Create a zero-filled Float32Array of the requested length,
 * backed by a SharedArrayBuffer when the environment exposes one.
 *
 * Any error raised while creating the shared backing store is
 * deliberately ignored and an ordinary Float32Array is returned
 * instead.
 *
 * @param {number} length Number of float elements.
 * @returns {Float32Array}
 */
function allocateBuffer(length) {
  const sharedMemoryAvailable = typeof SharedArrayBuffer !== 'undefined';

  if (sharedMemoryAvailable) {
    try {
      const backingStore = new SharedArrayBuffer(length * 4); // 4 bytes per float32
      return new Float32Array(backingStore);
    } catch {
      // Best effort only: fall back to a regular typed array below.
    }
  }

  return new Float32Array(length);
}
/**
 * Tanh-based soft clipper.
 *
 * Samples whose magnitude is at or below SOFT_CLIP_THRESHOLD are
 * returned untouched. Larger magnitudes are squeezed into the
 * remaining headroom between the threshold and 1.0 with a tanh
 * curve, keeping the sign of the input.
 *
 * @param {number} sample Input sample.
 * @returns {number} Clipped sample.
 */
function softClip(sample) {
  const magnitude = Math.abs(sample);

  if (magnitude > SOFT_CLIP_THRESHOLD) {
    // Normalise the overshoot by the headroom, then map
    // (threshold..∞) onto (threshold..1) through tanh.
    const headroom = 1 - SOFT_CLIP_THRESHOLD;
    const overshoot = (magnitude - SOFT_CLIP_THRESHOLD) / headroom;
    const limited = SOFT_CLIP_THRESHOLD + headroom * Math.tanh(overshoot);
    return sample < 0 ? -limited : limited;
  }

  return sample;
}
/* ──────────────────────────────────────────────────────────────── */
/* Processor */
/* ──────────────────────────────────────────────────────────────── */
/**
 * AudioWorklet processor that applies automatic gain control to a
 * voice stream.
 *
 * Once per render quantum it measures the level of the first input
 * channel, derives a desired gain toward the configured target
 * loudness, smooths the gain with separate attack/release
 * coefficients, and writes the gained, soft-clipped samples to
 * every output channel that has a matching input channel.
 *
 * Configuration is received as plain objects on `this.port`; see
 * `_handleMessage` for the accepted keys.
 */
class VoiceLevelingProcessor extends AudioWorkletProcessor {
  /* ── State ──────────────────────────────────────────────────── */

  /** Whether processing is enabled (bypass when false). */
  _enabled = true;

  /** Target loudness in dBFS. */
  _targetDbfs = DEFAULT_TARGET_DBFS;

  /** Maximum gain boost in dB. */
  _maxGainDb = DEFAULT_MAX_GAIN_DB;

  /** Linear ceiling for the gain multiplier (derived from `_maxGainDb`). */
  _maxGainLinear = dbToLinear(DEFAULT_MAX_GAIN_DB);

  /** AGC strength factor (0-1). Scales the dB correction. */
  _strength = STRENGTH_PRESETS.medium;

  /**
   * When true, the gain decays toward unity during silence.
   * When false, the gain is simply held during silence.
   */
  _noiseGateEnabled = false;

  /** RMS threshold (linear) below which input is treated as silence. */
  _silenceThreshold = DEFAULT_SILENCE_THRESHOLD;

  /** Attack smoothing coefficient. */
  _alphaAttack = 0;

  /** Release smoothing coefficient. */
  _alphaRelease = 0;

  /** Silence decay smoothing coefficient. */
  _alphaSilenceDecay = 0;

  /**
   * Running mean-square envelope. Kept in the squared domain; the
   * square root is taken once per quantum when the gain is computed.
   */
  _envelopeSq = 0;

  /** Currently applied gain (linear). Smoothed toward the desired gain. */
  _currentGain = 1.0;

  /**
   * Pre-allocated 128-element buffer.
   * NOTE(review): nothing in this class reads or writes it — confirm
   * whether it is still needed before removing.
   */
  _scratchBuffer = allocateBuffer(128);

  /* ── Constructor ────────────────────────────────────────────── */

  constructor(options) {
    super(options);

    // Compute smoothing coefficients from the default speed.
    this._applySpeed('medium');

    // Configuration changes arrive from the node's message port.
    this.port.onmessage = (event) => this._handleMessage(event.data);
  }

  /* ── Configuration ──────────────────────────────────────────── */

  /**
   * Handle a configuration message.
   *
   * Accepted keys (each is optional; invalid values are ignored):
   *   enabled    : boolean
   *   targetDbfs : number, clamped to -30 … -12
   *   maxGainDb  : number, clamped to 3 … 20
   *   strength   : 'low' | 'medium' | 'high'
   *   speed      : 'slow' | 'medium' | 'fast'
   *   noiseGate  : boolean
   *
   * @param {object} msg
   */
  _handleMessage(msg) {
    if (msg == null || typeof msg !== 'object') return;

    if (typeof msg.enabled === 'boolean') {
      this._enabled = msg.enabled;
      if (!msg.enabled) {
        // Reset to unity on disable so re-enabling starts clean.
        this._currentGain = 1.0;
        this._envelopeSq = 0;
      }
    }

    // NaN passes a `typeof === 'number'` check and survives
    // Math.min/Math.max, which would turn every later gain into NaN.
    if (typeof msg.targetDbfs === 'number' && !Number.isNaN(msg.targetDbfs)) {
      this._targetDbfs = Math.max(-30, Math.min(-12, msg.targetDbfs));
    }

    if (typeof msg.maxGainDb === 'number' && !Number.isNaN(msg.maxGainDb)) {
      const clamped = Math.max(3, Math.min(20, msg.maxGainDb));
      this._maxGainDb = clamped;
      this._maxGainLinear = dbToLinear(clamped);
    }

    // Own-property checks: a plain `PRESETS[key] != null` test would
    // also accept inherited keys such as 'constructor' or 'toString'.
    if (typeof msg.strength === 'string' && Object.hasOwn(STRENGTH_PRESETS, msg.strength)) {
      this._strength = STRENGTH_PRESETS[msg.strength];
    }

    if (typeof msg.speed === 'string' && Object.hasOwn(SPEED_PRESETS, msg.speed)) {
      this._applySpeed(msg.speed);
    }

    if (typeof msg.noiseGate === 'boolean') {
      this._noiseGateEnabled = msg.noiseGate;
    }
  }

  /**
   * Recompute the attack / release / silence-decay coefficients for
   * the given speed preset.
   *
   * The rate handed to `timeConstantToAlpha` is quanta-per-second
   * (`sampleRate / RENDER_QUANTUM_FRAMES`) because the smoothing
   * filters are stepped once per `process()` call, not per sample.
   * `sampleRate` is read as a free (global) identifier.
   *
   * Unknown preset names are ignored and leave the current
   * coefficients unchanged.
   *
   * @param {'slow' | 'medium' | 'fast'} preset
   */
  _applySpeed(preset) {
    if (!Object.hasOwn(SPEED_PRESETS, preset)) return;

    const { attack, release } = SPEED_PRESETS[preset];
    const fps = sampleRate / RENDER_QUANTUM_FRAMES;

    this._alphaAttack = timeConstantToAlpha(attack, fps);
    this._alphaRelease = timeConstantToAlpha(release, fps);
    this._alphaSilenceDecay = timeConstantToAlpha(SILENCE_DECAY_TC, fps);
  }

  /* ── DSP ────────────────────────────────────────────────────── */

  /**
   * Main audio processing callback.
   *
   * Only the first input/output pair (`inputs[0]` / `outputs[0]`)
   * is used. The level is measured on channel 0 and the resulting
   * gain is applied to every channel present on both sides.
   *
   * @param {Float32Array[][]} inputs  Input channels.
   * @param {Float32Array[][]} outputs Output channels.
   * @returns {boolean} Always `true`.
   */
  process(inputs, outputs) {
    const input = inputs[0];
    const output = outputs[0];

    // Nothing to read or nowhere to write: leave the output untouched.
    if (!input || input.length === 0 || !input[0] || input[0].length === 0) {
      return true;
    }
    if (!output || !output[0]) {
      return true;
    }

    // Bypass: copy input to output unchanged.
    if (!this._enabled) {
      this._copyThrough(input, output);
      return true;
    }

    // 1. Level estimation, 2.–4. gate / target gain / smoothing.
    const rms = this._measureLevel(input[0]);
    this._updateGain(rms);

    // 5. Apply the (per-quantum constant) gain and soft-clip.
    const gain = this._currentGain;
    const numSamples = input[0].length;
    const channelCount = Math.min(input.length, output.length);
    for (let ch = 0; ch < channelCount; ch++) {
      const src = input[ch];
      const dst = output[ch];
      if (!src || !dst) continue;
      for (let i = 0; i < numSamples; i++) {
        dst[i] = softClip(src[i] * gain);
      }
    }

    return true;
  }

  /**
   * Copy every channel present on both sides from input to output.
   *
   * @param {Float32Array[]} input
   * @param {Float32Array[]} output
   */
  _copyThrough(input, output) {
    const channelCount = Math.min(input.length, output.length);
    for (let ch = 0; ch < channelCount; ch++) {
      if (input[ch] && output[ch]) {
        output[ch].set(input[ch]);
      }
    }
  }

  /**
   * Fold one quantum of samples into the running mean-square
   * envelope and return the smoothed RMS.
   *
   * The envelope uses the attack coefficient when the level rises
   * and the release coefficient when it falls.
   *
   * @param {Float32Array} samples Non-empty block of samples.
   * @returns {number} Smoothed RMS (linear), floored at 1e-6.
   */
  _measureLevel(samples) {
    const count = samples.length;
    let sumSq = 0;
    for (let i = 0; i < count; i++) {
      const s = samples[i];
      sumSq += s * s;
    }
    const frameMeanSq = sumSq / count;

    const alpha = frameMeanSq > this._envelopeSq
      ? this._alphaAttack
      : this._alphaRelease;
    this._envelopeSq += alpha * (frameMeanSq - this._envelopeSq);

    // Floor the mean square at 1e-12 so the RMS is never exactly zero.
    return Math.sqrt(Math.max(this._envelopeSq, 1e-12));
  }

  /**
   * Advance `_currentGain` by one quantum for the given RMS level.
   *
   * Below the silence threshold no new target is computed: with the
   * noise gate on, the gain decays toward unity; with it off, the
   * gain is held as-is. Otherwise the gain moves toward
   *
   *   10^(((targetDbfs - currentDbfs) * strength) / 20)
   *
   * limited to the range [0.5, _maxGainLinear].
   *
   * @param {number} rms Smoothed RMS (linear).
   */
  _updateGain(rms) {
    if (rms < this._silenceThreshold) {
      if (this._noiseGateEnabled) {
        this._currentGain += this._alphaSilenceDecay * (1.0 - this._currentGain);
      }
      return;
    }

    // Scale the dB error by strength: 1.0 corrects fully to target.
    const errorDb = this._targetDbfs - linearToDb(rms);
    const correctionDb = errorDb * this._strength;

    // Cap the boost at the configured maximum, and never attenuate
    // by more than a factor of 0.5 (about -6 dB).
    let desiredGain = dbToLinear(correctionDb);
    if (desiredGain > this._maxGainLinear) {
      desiredGain = this._maxGainLinear;
    }
    if (desiredGain < 0.5) {
      desiredGain = 0.5;
    }

    // Attack coefficient when the gain must drop, release
    // coefficient when it may rise.
    const gainAlpha = desiredGain < this._currentGain
      ? this._alphaAttack
      : this._alphaRelease;
    this._currentGain += gainAlpha * (desiredGain - this._currentGain);
  }
}
// Register the processor class under the name held in PROCESSOR_NAME.
// NOTE(review): presumably the main-thread AudioWorkletNode must be
// created with this same name — confirm against the caller.
registerProcessor(PROCESSOR_NAME, VoiceLevelingProcessor);