Voice Leveling (untested)

2026-03-03 03:41:59 +01:00
parent cf91d77502
commit 8315df42fc
9 changed files with 1313 additions and 19 deletions
--- a/public/voice-leveling-worklet.js
+++ b/public/voice-leveling-worklet.js
@@ -0,0 +1,442 @@
+/**
+ * VoiceLevelingProcessor — AudioWorkletProcessor that implements
+ * broadcast-grade per-speaker automatic gain control (AGC).
+ *
+ * ═══════════════════════════════════════════════════════════════════
+ *  DSP DESIGN NOTES
+ * ═══════════════════════════════════════════════════════════════════
+ *
+ * This processor mimics WebRTC's Gain Controller 2 (AGC2) behaviour
+ * using a lightweight algorithm suitable for real-time voice in an
+ * AudioWorklet thread.
+ *
+ * Pipeline (per 128-sample render quantum ≈ 2.67 ms @ 48 kHz):
+ *
+ *   1. RMS level estimation  (short-term envelope)
+ *   2. Silence gate          (freeze gain when below noise floor)
+ *   3. Target gain compute   (desired dBFS → linear gain)
+ *   4. Gain clamp            (max boost and −6 dB attenuation floor)
+ *   5. Gain smoothing        (exponential attack / release)
+ *   6. Soft-clip limiter     (prevent digital overs)
+ *
+ * Key properties:
+ *   • No per-frame allocation — all buffers pre-allocated.
+ *   • Synchronous processing — no message passing in hot path.
+ *   • Uses Float32 throughout — native AudioWorklet format.
+ *   • 128-sample quantum fits within 10 ms at 48 kHz (2.67 ms).
+ *
+ * The processor receives configuration via AudioWorkletNode.port
+ * messages and applies them on the next render quantum.
+ *
+ * ═══════════════════════════════════════════════════════════════════
+ */
+
+/* ──────────────────────────────────────────────────────────────── */
+/*  Constants                                                       */
+/* ──────────────────────────────────────────────────────────────── */
+
+/** Processor name registered with `registerProcessor`. */
+const PROCESSOR_NAME = 'VoiceLevelingProcessor';
+
+/**
+ * Web Audio render quantum size — the number of samples processed
+ * in each call to `process()`.  The AudioWorklet spec mandates 128.
+ */
+const RENDER_QUANTUM_FRAMES = 128;
+
+/**
+ * Minimum RMS level (linear) below which the input is considered
+ * silence. Gain is frozen/decayed when the signal is this quiet.
+ * 0.001 linear = −60 dBFS.
+ */
+const DEFAULT_SILENCE_THRESHOLD = 0.001;
+
+/**
+ * The target RMS level in dBFS.  −18 dBFS is a comfortable
+ * conversational loudness for headphone listening.
+ */
+const DEFAULT_TARGET_DBFS = -18;
+
+/** Default maximum gain boost in dB. */
+const DEFAULT_MAX_GAIN_DB = 12;
+
+/** Soft-clip ceiling — prevents digital overs. */
+const SOFT_CLIP_THRESHOLD = 0.95;
+
+/**
+ * Speed presets: attack and release time constants (seconds).
+ *
+ * Attack = how fast gain *decreases* when a loud signal arrives.
+ * Release = how fast gain *increases* when the signal gets quieter.
+ *
+ * Asymmetric: fast attack prevents clipping, slow release sounds
+ * natural and avoids "pumping".
+ *
+ * The table is looked up with strings that arrive in port messages,
+ * so it is frozen and has a null prototype: inherited keys such as
+ * `constructor` or `toString` resolve to `undefined` instead of a
+ * function, and nothing can mutate the presets at runtime.
+ */
+const SPEED_PRESETS = Object.freeze({
+  __proto__: null,
+  slow:   Object.freeze({ attack: 0.015, release: 0.800 }),
+  medium: Object.freeze({ attack: 0.010, release: 0.400 }),
+  fast:   Object.freeze({ attack: 0.005, release: 0.150 }),
+});
+
+/**
+ * AGC strength presets: scale the computed gain adjustment.
+ * 1.0 = full correction toward target; lower = gentler leveling.
+ *
+ * Frozen, null-prototype table for the same reason as SPEED_PRESETS:
+ * only the three keys below can ever match a message value.
+ */
+const STRENGTH_PRESETS = Object.freeze({
+  __proto__: null,
+  low:    0.5,
+  medium: 0.75,
+  high:   1.0,
+});
+
+/**
+ * When silence is detected and the noise gate is enabled, the gain
+ * decays toward 1.0 (unity) with this time constant (seconds). This
+ * prevents the gain from sitting at a huge value after long silence
+ * and then blasting when speech resumes.
+ */
+const SILENCE_DECAY_TC = 2.0;
+
+/* ──────────────────────────────────────────────────────────────── */
+/*  Helpers                                                         */
+/* ──────────────────────────────────────────────────────────────── */
+
+/**
+ * Translate a level expressed in decibels into a linear amplitude
+ * multiplier (0 dB → 1.0, +20 dB → 10, −20 dB → 0.1).
+ *
+ * @param {number} db  Level in decibels.
+ * @returns {number}    Linear gain factor.
+ */
+function dbToLinear(db) {
+  const exponent = db / 20;
+  return Math.pow(10, exponent);
+}
+
+/**
+ * Express a linear amplitude in dBFS.
+ *
+ * Zero and negative amplitudes have no logarithm, so they map to
+ * −Infinity.
+ *
+ * @param {number} linear  Linear amplitude.
+ * @returns {number}        Level in dBFS, or −Infinity for input ≤ 0.
+ */
+function linearToDb(linear) {
+  return linear <= 0 ? -Infinity : 20 * Math.log10(linear);
+}
+
+/**
+ * Derive the one-pole smoothing coefficient (α) that corresponds to
+ * a time constant, given how many times per second the filter is
+ * updated.
+ *
+ * The update rate is the **frame rate** — render quanta per second,
+ * i.e. sampleRate / 128 — because the envelope and gain filters
+ * advance once per render quantum.  Passing the raw sample rate
+ * instead yields a vanishingly small α and an AGC that looks frozen.
+ *
+ *   α = 1 − e^(−1 / (tc * fps))
+ *
+ * A larger α means a faster response.  A non-positive time constant
+ * means "no smoothing" and returns 1.
+ *
+ * @param {number} tc   Time constant in seconds.
+ * @param {number} fps  Frame rate (render quanta per second).
+ * @returns {number}     Smoothing coefficient (0–1).
+ */
+function timeConstantToAlpha(tc, fps) {
+  const isInstant = tc <= 0;
+  if (isInstant) {
+    return 1.0;
+  }
+  const quantaPerTimeConstant = tc * fps;
+  const decayPerQuantum = Math.exp(-1.0 / quantaPerTimeConstant);
+  return 1.0 - decayPerQuantum;
+}
+
+/**
+ * Allocate a zero-filled Float32Array of the requested length.
+ *
+ * When the environment exposes SharedArrayBuffer the array is backed
+ * by one; if SharedArrayBuffer is missing, or constructing the shared
+ * view throws, an ordinary Float32Array is returned instead.
+ *
+ * @param {number} length  Number of elements.
+ * @returns {Float32Array}
+ */
+function allocateBuffer(length) {
+  const sharedAvailable = typeof SharedArrayBuffer !== 'undefined';
+  if (sharedAvailable) {
+    try {
+      const backing = new SharedArrayBuffer(length * Float32Array.BYTES_PER_ELEMENT);
+      return new Float32Array(backing);
+    } catch {
+      // Shared allocation failed — use the regular allocation below.
+    }
+  }
+  return new Float32Array(length);
+}
+
+/**
+ * Limit a sample with a tanh knee so the output magnitude never
+ * reaches 1.0.
+ *
+ * Samples whose magnitude is at or below SOFT_CLIP_THRESHOLD are
+ * returned untouched.  Anything louder is folded into the remaining
+ * headroom (threshold..1) with tanh; positive and negative samples
+ * are treated symmetrically.
+ *
+ * @param {number} sample  Input sample.
+ * @returns {number}        Clipped sample.
+ */
+function softClip(sample) {
+  const magnitude = Math.abs(sample);
+  if (magnitude <= SOFT_CLIP_THRESHOLD) {
+    return sample;
+  }
+  // Distance between the knee and full scale.
+  const headroom = 1 - SOFT_CLIP_THRESHOLD;
+  // How far past the knee we are, in units of headroom (0..∞).
+  const overshoot = (magnitude - SOFT_CLIP_THRESHOLD) / headroom;
+  // tanh maps (0..∞) → (0..1), so the result stays inside the headroom.
+  const compressed = SOFT_CLIP_THRESHOLD + headroom * Math.tanh(overshoot);
+  return sample >= 0 ? compressed : -compressed;
+}
+
+/* ──────────────────────────────────────────────────────────────── */
+/*  Processor                                                       */
+/* ──────────────────────────────────────────────────────────────── */
+
+/**
+ * AudioWorkletProcessor that applies automatic gain control to its
+ * first input and writes the result to its first output.
+ *
+ * Once per render quantum it measures the level of channel 0, derives
+ * a desired gain that moves the level toward the target, smooths that
+ * gain with separate attack/release coefficients, and applies the
+ * same gain (followed by a soft clipper) to every channel.
+ *
+ * Configuration arrives as plain objects on `this.port`; see
+ * `_handleMessage` for the accepted keys.
+ */
+class VoiceLevelingProcessor extends AudioWorkletProcessor {
+
+  /* ── State ──────────────────────────────────────────────────── */
+
+  /** Whether processing is enabled (bypass when false). */
+  _enabled = true;
+
+  /** Target loudness in dBFS. */
+  _targetDbfs = DEFAULT_TARGET_DBFS;
+
+  /** Maximum gain boost in dB. */
+  _maxGainDb = DEFAULT_MAX_GAIN_DB;
+
+  /** Linear ceiling for the gain multiplier. */
+  _maxGainLinear = dbToLinear(DEFAULT_MAX_GAIN_DB);
+
+  /** AGC strength factor (0–1). Scales the gain correction. */
+  _strength = STRENGTH_PRESETS.medium;
+
+  /**
+   * Whether silence decays the gain toward unity (true) or merely
+   * freezes it (false).
+   */
+  _noiseGateEnabled = false;
+
+  /** RMS threshold below which input is treated as silence. */
+  _silenceThreshold = DEFAULT_SILENCE_THRESHOLD;
+
+  /** Attack smoothing coefficient. */
+  _alphaAttack = 0;
+
+  /** Release smoothing coefficient. */
+  _alphaRelease = 0;
+
+  /** Silence decay smoothing coefficient. */
+  _alphaSilenceDecay = 0;
+
+  /**
+   * Running mean-square envelope (squared domain, so the sqrt is
+   * taken once per quantum). Smoothed with a one-pole filter.
+   */
+  _envelopeSq = 0;
+
+  /** Current applied gain (linear). Smoothed toward target. */
+  _currentGain = 1.0;
+
+  /**
+   * Pre-allocated buffer sized to one render quantum.
+   * NOTE(review): nothing in this class reads or writes it at the
+   * moment — the RMS loop works directly on the input channel.
+   */
+  _scratchBuffer = allocateBuffer(RENDER_QUANTUM_FRAMES);
+
+  /* ── Constructor ────────────────────────────────────────────── */
+
+  /**
+   * @param {object} [options]  Forwarded unchanged to
+   *                            AudioWorkletProcessor.
+   */
+  constructor(options) {
+    super(options);
+
+    // Compute smoothing coefficients from default speed
+    this._applySpeed('medium');
+
+    // Listen for configuration changes from the main thread.
+    this.port.onmessage = (event) => this._handleMessage(event.data);
+  }
+
+  /* ── Configuration ──────────────────────────────────────────── */
+
+  /**
+   * Handle a configuration message from the main thread.
+   *
+   * Accepted keys (all optional; unknown or invalid values are
+   * ignored and leave the current setting untouched):
+   *   enabled        : boolean
+   *   targetDbfs     : finite number, clamped to -30 … -12
+   *   maxGainDb      : finite number, clamped to 3 … 20
+   *   strength       : 'low' | 'medium' | 'high'
+   *   speed          : 'slow' | 'medium' | 'fast'
+   *   noiseGate      : boolean
+   *
+   * @param {object} msg
+   */
+  _handleMessage(msg) {
+    if (msg == null || typeof msg !== 'object') return;
+
+    if (typeof msg.enabled === 'boolean') {
+      this._enabled = msg.enabled;
+      if (!msg.enabled) {
+        // Reset gain to unity on disable so re-enabling starts clean
+        this._currentGain = 1.0;
+        this._envelopeSq = 0;
+      }
+    }
+
+    // Number.isFinite rejects NaN/±Infinity (and non-numbers). NaN
+    // would survive the min/max clamp and turn every later gain
+    // computation into NaN.
+    if (Number.isFinite(msg.targetDbfs)) {
+      this._targetDbfs = Math.max(-30, Math.min(-12, msg.targetDbfs));
+    }
+
+    if (Number.isFinite(msg.maxGainDb)) {
+      const clamped = Math.max(3, Math.min(20, msg.maxGainDb));
+      this._maxGainDb = clamped;
+      this._maxGainLinear = dbToLinear(clamped);
+    }
+
+    // Own-property checks only: a key such as 'constructor' or
+    // 'toString' must not be mistaken for a preset name.
+    if (typeof msg.strength === 'string'
+        && Object.prototype.hasOwnProperty.call(STRENGTH_PRESETS, msg.strength)) {
+      this._strength = STRENGTH_PRESETS[msg.strength];
+    }
+
+    if (typeof msg.speed === 'string'
+        && Object.prototype.hasOwnProperty.call(SPEED_PRESETS, msg.speed)) {
+      this._applySpeed(msg.speed);
+    }
+
+    if (typeof msg.noiseGate === 'boolean') {
+      this._noiseGateEnabled = msg.noiseGate;
+    }
+  }
+
+  /**
+   * Recompute attack/release/silence-decay coefficients for
+   * the current sample rate.
+   *
+   * IMPORTANT: We use frames-per-second (sampleRate / 128), NOT
+   * the raw sampleRate, because the smoothing filter is applied
+   * once per render quantum — not once per sample.
+   *
+   * @param {'slow' | 'medium' | 'fast'} preset  Must be a key of
+   *        SPEED_PRESETS; callers validate before calling.
+   */
+  _applySpeed(preset) {
+    const { attack, release } = SPEED_PRESETS[preset];
+    const fps = sampleRate / RENDER_QUANTUM_FRAMES;
+    this._alphaAttack = timeConstantToAlpha(attack, fps);
+    this._alphaRelease = timeConstantToAlpha(release, fps);
+    this._alphaSilenceDecay = timeConstantToAlpha(SILENCE_DECAY_TC, fps);
+  }
+
+  /* ── DSP ────────────────────────────────────────────────────── */
+
+  /**
+   * Main audio processing callback.
+   *
+   * Only the first input and first output are used. The level is
+   * measured on channel 0; the resulting gain is applied to every
+   * channel that exists on both the input and the output.
+   *
+   * @param {Float32Array[][]} inputs   Input channels.
+   * @param {Float32Array[][]} outputs  Output channels.
+   * @returns {boolean}  `true` to keep the processor alive.
+   */
+  process(inputs, outputs) {
+    const input = inputs[0];
+    const output = outputs[0];
+
+    // No input → nothing to write; the output is left as delivered.
+    if (!input || input.length === 0 || !input[0]) {
+      return true;
+    }
+    // No output channel to write into → nothing to do. Without this
+    // guard the writes below would throw.
+    if (!output || output.length === 0 || !output[0]) {
+      return true;
+    }
+
+    const inputChannel = input[0];
+    const numSamples = inputChannel.length;
+
+    // ── Bypass mode ──────────────────────────────────────────
+    if (!this._enabled) {
+      // Copy every channel present on both sides, unchanged.
+      for (let ch = 0; ch < input.length; ch++) {
+        if (output[ch] && input[ch]) {
+          output[ch].set(input[ch]);
+        }
+      }
+      return true;
+    }
+
+    // ── 1. RMS level estimation ──────────────────────────────
+    //
+    // Compute the mean square of this render quantum and smooth it
+    // with a one-pole IIR filter (exponential moving average).
+    //
+    // We work in the squared domain to avoid a sqrt per sample;
+    // the sqrt is taken only once per quantum for the gain calc.
+    //
+    // NOTE(review): only channel 0 is measured. A multi-channel
+    // stream whose energy sits in another channel is levelled from
+    // channel 0's level — confirm that is intended.
+
+    let sumSq = 0;
+    for (let i = 0; i < numSamples; i++) {
+      const s = inputChannel[i];
+      sumSq += s * s;
+    }
+    const frameMeanSq = sumSq / numSamples;
+
+    // A NaN/Infinity sample (or an empty channel, 0 / 0) makes the
+    // frame energy non-finite. Feeding that into the envelope would
+    // leave _envelopeSq and _currentGain stuck at NaN for every
+    // later quantum, so emit silence for this quantum and keep the
+    // state untouched.
+    if (!Number.isFinite(frameMeanSq)) {
+      for (let ch = 0; ch < output.length; ch++) {
+        output[ch].fill(0);
+      }
+      return true;
+    }
+
+    // Smooth envelope: use attack for rising levels, release for falling
+    const alpha = frameMeanSq > this._envelopeSq
+      ? this._alphaAttack
+      : this._alphaRelease;
+    this._envelopeSq += alpha * (frameMeanSq - this._envelopeSq);
+
+    // Current smoothed RMS (linear); floored so the log below is finite.
+    const rms = Math.sqrt(Math.max(this._envelopeSq, 1e-12));
+
+    // ── 2. Silence gate ──────────────────────────────────────
+    //
+    // If the RMS is below the silence threshold, do NOT compute
+    // a new gain target. With the noise gate enabled the current
+    // gain decays slowly toward unity (1.0) so we don't slam the
+    // listener when speech resumes; with it disabled the gain is
+    // simply held.
+
+    const isSilence = rms < this._silenceThreshold;
+
+    if (isSilence && this._noiseGateEnabled) {
+      // Decay gain toward 1.0
+      this._currentGain += this._alphaSilenceDecay * (1.0 - this._currentGain);
+    } else if (!isSilence) {
+      // ── 3. Target gain computation ───────────────────────
+      //
+      // Desired gain = 10^((targetDbfs − currentDbfs) / 20)
+      //
+      // We scale the correction by the strength factor so that
+      // "low" strength applies only 50 % of the correction.
+
+      const currentDbfs = linearToDb(rms);
+      const errorDb = this._targetDbfs - currentDbfs;
+
+      // Scale the correction by strength.
+      // A strength of 1.0 means "correct fully to target".
+      const correctionDb = errorDb * this._strength;
+      let desiredGain = dbToLinear(correctionDb);
+
+      // Clamp to max gain
+      if (desiredGain > this._maxGainLinear) {
+        desiredGain = this._maxGainLinear;
+      }
+      // Never attenuate below a certain floor (we're leveling UP,
+      // but very loud signals still need to be pulled down).
+      // Allow attenuation down to −6 dB.
+      if (desiredGain < 0.5) {
+        desiredGain = 0.5;
+      }
+
+      // ── 4. Gain smoothing ──────────────────────────────
+      //
+      // Exponentially interpolate the current gain toward the
+      // desired gain. Use fast attack (gain DOWN) and slow
+      // release (gain UP) for natural dynamics.
+
+      const gainAlpha = desiredGain < this._currentGain
+        ? this._alphaAttack   // Gain is decreasing (loud signal arrived)
+        : this._alphaRelease; // Gain is increasing (signal got quieter)
+
+      this._currentGain += gainAlpha * (desiredGain - this._currentGain);
+    }
+    // If isSilence && !noiseGateEnabled → gain stays as-is (frozen)
+
+    // ── 5. Apply gain & soft-clip ─────────────────────────────
+    //
+    // The same gain is applied to every channel present on both
+    // sides.
+    const gain = this._currentGain;
+    for (let ch = 0; ch < input.length; ch++) {
+      const src = input[ch];
+      const dst = output[ch];
+      if (!src || !dst) continue;
+      for (let i = 0; i < numSamples; i++) {
+        dst[i] = softClip(src[i] * gain);
+      }
+    }
+
+    return true;
+  }
+}
+
+// Register the class under PROCESSOR_NAME ('VoiceLevelingProcessor').
+// NOTE(review): main-thread code presumably creates the node with this
+// same name — confirm it matches the AudioWorkletNode constructor call.
+registerProcessor(PROCESSOR_NAME, VoiceLevelingProcessor);