// Toju/public/voice-leveling-worklet.js

/**
 * VoiceLevelingProcessor — AudioWorkletProcessor that implements
 * broadcast-grade per-speaker automatic gain control (AGC).
 *
 * ═══════════════════════════════════════════════════════════════════
 *  DSP DESIGN NOTES
 * ═══════════════════════════════════════════════════════════════════
 *
 * This processor mimics WebRTC's Gain Controller 2 (AGC2) behaviour
 * using a lightweight algorithm suitable for real-time voice in an
 * AudioWorklet thread.
 *
 * Pipeline (per 128-sample render quantum ≈ 2.67 ms @ 48 kHz):
 *
 *   1. RMS level estimation  (short-term envelope)
 *   2. Silence gate          (freeze or decay gain when below noise floor)
 *   3. Target gain compute   (desired dBFS → linear gain, clamped to the
 *                             max-gain ceiling and a −6 dB floor)
 *   4. Gain smoothing        (exponential attack / release)
 *   5. Gain apply + soft-clip limiter  (prevent digital overs)
 *
 * Key properties:
 *   • No per-frame allocation — all buffers pre-allocated.
 *   • Synchronous processing — no message passing in hot path.
 *   • Uses Float32 throughout — native AudioWorklet format.
 *   • 128-sample quantum fits within 10 ms at 48 kHz (2.67 ms).
 *
 * The processor receives configuration via AudioWorkletNode.port
 * messages and applies them on the next render quantum.
 *
 * ═══════════════════════════════════════════════════════════════════
 */

/* ──────────────────────────────────────────────────────────────── */
/*  Constants                                                       */
/* ──────────────────────────────────────────────────────────────── */

/** Processor name registered with `registerProcessor`. */
const PROCESSOR_NAME = 'VoiceLevelingProcessor';

/**
 * Web Audio render quantum size — the number of samples processed
 * in each call to `process()`.  The AudioWorklet spec mandates 128.
 */
const RENDER_QUANTUM_FRAMES = 128;

/**
 * Minimum RMS level (linear) below which the input is considered
 * silence. Gain is frozen/decayed when the signal is this quiet.
 * Roughly −60 dBFS.
 */
const DEFAULT_SILENCE_THRESHOLD = 0.001;

/**
 * The target RMS level in dBFS.  −18 dBFS is a comfortable
 * conversational loudness for headphone listening.
 */
const DEFAULT_TARGET_DBFS = -18;

/** Default maximum gain boost in dB. */
const DEFAULT_MAX_GAIN_DB = 12;

/** Soft-clip ceiling — prevents digital overs. */
const SOFT_CLIP_THRESHOLD = 0.95;

/**
 * Speed presets: attack and release time constants (seconds).
 *
 * Attack = how fast gain *decreases* when a loud signal arrives.
 * Release = how fast gain *increases* when the signal gets quieter.
 *
 * Asymmetric: fast attack prevents clipping, slow release sounds
 * natural and avoids "pumping".
 *
 * The table is keyed by strings that arrive in port messages, so it is
 * built on a null prototype: inherited names such as `'toString'` or
 * `'constructor'` must look up as `undefined` rather than resolving to
 * `Object.prototype` members.  The table and every preset are frozen
 * because they are shared, read-only configuration.
 */
const SPEED_PRESETS = Object.freeze(Object.assign(Object.create(null), {
  slow:   Object.freeze({ attack: 0.015, release: 0.800 }),
  medium: Object.freeze({ attack: 0.010, release: 0.400 }),
  fast:   Object.freeze({ attack: 0.005, release: 0.150 }),
}));

/**
 * AGC strength presets: scale the computed gain adjustment.
 * 1.0 = full correction toward target; lower = gentler leveling.
 *
 * The table is keyed by strings that arrive in port messages, so it is
 * built on a null prototype: inherited names such as `'toString'` must
 * look up as `undefined` instead of yielding a function that would be
 * stored as the strength factor.  It is frozen because it is shared,
 * read-only configuration.
 */
const STRENGTH_PRESETS = Object.freeze(Object.assign(Object.create(null), {
  low:    0.5,
  medium: 0.75,
  high:   1.0,
}));

/**
 * Time constant (seconds) with which the gain decays toward 1.0
 * (unity) while silence is detected.  This prevents the gain from
 * sitting at a huge value after long silence and then blasting when
 * speech resumes.
 *
 * The decay is only applied while the noise gate is enabled; with the
 * gate disabled the gain is simply held during silence.
 */
const SILENCE_DECAY_TC = 2.0;

/* ──────────────────────────────────────────────────────────────── */
/*  Helpers                                                         */
/* ──────────────────────────────────────────────────────────────── */

/**
 * Translate a level expressed in decibels into a linear amplitude
 * factor (0 dB → 1, +20 dB → 10, −∞ dB → 0).
 *
 * @param {number} db  Level in decibels.
 * @returns {number}    Equivalent linear multiplier.
 */
function dbToLinear(db) {
  // Amplitude ratios use a factor of 20 per decade.
  const decades = db / 20;
  return Math.pow(10, decades);
}

/**
 * Translate a linear amplitude into a level in dBFS.
 *
 * Zero and negative amplitudes have no logarithm and are reported as
 * −Infinity.
 *
 * @param {number} linear  Linear amplitude (1.0 = full scale).
 * @returns {number}        Level in dBFS, or −Infinity for `linear <= 0`.
 */
function linearToDb(linear) {
  return linear <= 0 ? -Infinity : 20 * Math.log10(linear);
}

/**
 * Compute the exponential smoothing coefficient (α) for a given
 * time constant and **frame rate** (not sample rate!).
 *
 * Because the envelope / gain update runs once per render quantum
 * (128 samples), the rate passed here must be frames-per-second
 * (sampleRate / 128), NOT samples-per-second.  Using the raw
 * sampleRate would produce absurdly small α values, making the
 * AGC appear frozen.
 *
 *   α = 1 − e^(−1 / (tc * fps))
 *
 * Larger α → faster response.
 *
 * The result is always inside 0–1.  A time constant or frame rate that
 * is zero, negative or NaN cannot describe a smoothing filter, so it is
 * treated as "no smoothing" and yields α = 1 (the filter jumps straight
 * to its target).  This keeps NaN and negative coefficients, which
 * would destabilise the one-pole filters using α, from ever being
 * returned.
 *
 * @param {number} tc   Time constant in seconds.
 * @param {number} fps  Frame rate (render quanta per second).
 * @returns {number}     Smoothing coefficient (0–1).
 */
function timeConstantToAlpha(tc, fps) {
  // `!(x > 0)` rather than `x <= 0` so that NaN is rejected as well.
  if (!(tc > 0) || !(fps > 0)) return 1.0;

  // −expm1(−x) equals 1 − e^(−x) but does not cancel to 0 when x is tiny
  // (very long time constants).
  return -Math.expm1(-1.0 / (tc * fps));
}

/**
 * Create a zero-filled Float32Array with `length` elements.
 *
 * When the runtime exposes SharedArrayBuffer the array is backed by
 * shared memory; if that is unavailable, or constructing the shared
 * buffer throws, an ordinary Float32Array is returned instead.
 *
 * @param {number} length  Number of elements.
 * @returns {Float32Array}
 */
function allocateBuffer(length) {
  const sharedMemoryAvailable = typeof SharedArrayBuffer !== 'undefined';

  if (sharedMemoryAvailable) {
    try {
      // 4 bytes per 32-bit float element.
      const backingStore = new SharedArrayBuffer(length * 4);
      return new Float32Array(backingStore);
    } catch {
      // Best effort only — use the regular allocation below.
    }
  }

  return new Float32Array(length);
}

/**
 * Soft limiter built on tanh.
 *
 * Samples whose magnitude does not exceed SOFT_CLIP_THRESHOLD are
 * returned untouched.  Anything louder is squeezed into the headroom
 * between the threshold and full scale, so the magnitude of the result
 * approaches — but never goes beyond — 1.0.  The curve is symmetric for
 * positive and negative samples.
 *
 * @param {number} sample  Input sample.
 * @returns {number}        Limited sample.
 */
function softClip(sample) {
  const magnitude = Math.abs(sample);
  if (magnitude <= SOFT_CLIP_THRESHOLD) return sample;

  // Room left between the knee and full scale.
  const headroom = 1 - SOFT_CLIP_THRESHOLD;

  // Overshoot past the knee in units of headroom; tanh maps
  // (0..∞) onto (0..1), so the result stays below full scale.
  const overshoot = (magnitude - SOFT_CLIP_THRESHOLD) / headroom;
  const limited = SOFT_CLIP_THRESHOLD + headroom * Math.tanh(overshoot);

  // Here `sample` is non-zero (or NaN), so Math.sign yields ±1 (or NaN).
  return Math.sign(sample) * limited;
}

/* ──────────────────────────────────────────────────────────────── */
/*  Processor                                                       */
/* ──────────────────────────────────────────────────────────────── */

/**
 * AudioWorkletProcessor that levels a voice signal with a slow-acting
 * automatic gain control followed by a soft-clip limiter.
 *
 * Once per render quantum the processor:
 *   1. measures the mean-square level of channel 0 and smooths it,
 *   2. holds or decays the gain while the signal is silent,
 *   3. otherwise derives a desired gain from the distance to the target
 *      level (scaled by strength, clamped to −6 dB … max gain),
 *   4. moves the applied gain toward the desired gain (attack/release),
 *   5. multiplies every channel by that gain and soft-clips the result.
 *
 * Configuration arrives as plain objects on `this.port`; see
 * `_handleMessage` for the accepted keys.  Malformed values are
 * ignored so that a bad message can never corrupt the DSP state.
 */
class VoiceLevelingProcessor extends AudioWorkletProcessor {

  /* ── State ──────────────────────────────────────────────────── */

  /** Whether processing is enabled (bypass when false). */
  _enabled = true;

  /** Target loudness in dBFS. */
  _targetDbfs = DEFAULT_TARGET_DBFS;

  /** Maximum gain boost in dB. */
  _maxGainDb = DEFAULT_MAX_GAIN_DB;

  /** Linear ceiling for the gain multiplier (derived from `_maxGainDb`). */
  _maxGainLinear = dbToLinear(DEFAULT_MAX_GAIN_DB);

  /** AGC strength factor (0–1). Scales the gain correction. */
  _strength = STRENGTH_PRESETS.medium;

  /** Whether the silence/noise gate is active. */
  _noiseGateEnabled = false;

  /** RMS threshold below which input is treated as silence. */
  _silenceThreshold = DEFAULT_SILENCE_THRESHOLD;

  /** Attack smoothing coefficient (set by `_applySpeed`). */
  _alphaAttack = 0;

  /** Release smoothing coefficient (set by `_applySpeed`). */
  _alphaRelease = 0;

  /** Silence decay smoothing coefficient (set by `_applySpeed`). */
  _alphaSilenceDecay = 0;

  /**
   * Running RMS envelope (squared, to avoid sqrt every frame).
   * Smoothed with a one-pole filter.
   */
  _envelopeSq = 0;

  /** Current applied gain (linear). Smoothed toward target. */
  _currentGain = 1.0;

  /**
   * Pre-allocated buffer of one render quantum.
   *
   * NOTE(review): nothing in this file reads or writes this buffer —
   * the RMS is accumulated directly from the input channel.  It is
   * kept only so the object layout stays as before; confirm whether
   * it can be dropped.
   */
  _scratchBuffer = allocateBuffer(RENDER_QUANTUM_FRAMES);

  /* ── Constructor ────────────────────────────────────────────── */

  /**
   * @param {object} [options]  Node options, forwarded unchanged to
   *                            AudioWorkletProcessor.
   */
  constructor(options) {
    super(options);

    // Compute smoothing coefficients from default speed
    this._applySpeed('medium');

    // Listen for configuration changes from the main thread.
    this.port.onmessage = (event) => this._handleMessage(event.data);
  }

  /* ── Configuration ──────────────────────────────────────────── */

  /**
   * Handle a configuration message from the main thread.
   *
   * Accepted keys (each is optional and validated independently;
   * anything that fails validation is ignored):
   *   enabled        : boolean
   *   targetDbfs     : number  (clamped to -30 … -12, NaN ignored)
   *   maxGainDb      : number  (clamped to 3 … 20, NaN ignored)
   *   strength       : 'low' | 'medium' | 'high'
   *   speed          : 'slow' | 'medium' | 'fast'
   *   noiseGate      : boolean
   *
   * @param {object} msg
   */
  _handleMessage(msg) {
    if (msg == null || typeof msg !== 'object') return;

    if (typeof msg.enabled === 'boolean') {
      this._enabled = msg.enabled;
      if (!msg.enabled) {
        // Reset gain to unity on disable so re-enabling starts clean
        this._currentGain = 1.0;
        this._envelopeSq = 0;
      }
    }

    // `typeof NaN === 'number'` and NaN survives Math.min/Math.max, so it
    // has to be rejected explicitly — a NaN target or ceiling would turn
    // the gain, and therefore the output, into NaN.
    if (typeof msg.targetDbfs === 'number' && !Number.isNaN(msg.targetDbfs)) {
      this._targetDbfs = Math.max(-30, Math.min(-12, msg.targetDbfs));
    }

    if (typeof msg.maxGainDb === 'number' && !Number.isNaN(msg.maxGainDb)) {
      const clamped = Math.max(3, Math.min(20, msg.maxGainDb));
      this._maxGainDb = clamped;
      this._maxGainLinear = dbToLinear(clamped);
    }

    const strength = this._lookupPreset(STRENGTH_PRESETS, msg.strength);
    if (strength !== undefined) {
      this._strength = strength;
    }

    if (this._lookupPreset(SPEED_PRESETS, msg.speed) !== undefined) {
      this._applySpeed(msg.speed);
    }

    if (typeof msg.noiseGate === 'boolean') {
      this._noiseGateEnabled = msg.noiseGate;
    }
  }

  /**
   * Resolve a preset name against a preset table.
   *
   * Only the table's own keys count: names inherited from
   * Object.prototype ('toString', 'constructor', …) are not presets and
   * must not be accepted from a message.
   *
   * @param {object} table  Preset table (e.g. SPEED_PRESETS).
   * @param {*} key         Candidate preset name from a message.
   * @returns {*}           The preset value, or `undefined` if `key` is
   *                        not a string naming an own entry of `table`.
   */
  _lookupPreset(table, key) {
    if (typeof key !== 'string') return undefined;
    if (!Object.prototype.hasOwnProperty.call(table, key)) return undefined;
    return table[key];
  }

  /**
   * Recompute attack/release/silence-decay coefficients for
   * the current sample rate.
   *
   * IMPORTANT: We use frames-per-second (sampleRate / 128), NOT
   * the raw sampleRate, because the smoothing filter is applied
   * once per render quantum — not once per sample.
   *
   * @param {'slow' | 'medium' | 'fast'} preset  Must name an entry of
   *                                              SPEED_PRESETS.
   */
  _applySpeed(preset) {
    const { attack, release } = SPEED_PRESETS[preset];
    const fps = sampleRate / RENDER_QUANTUM_FRAMES;
    this._alphaAttack = timeConstantToAlpha(attack, fps);
    this._alphaRelease = timeConstantToAlpha(release, fps);
    this._alphaSilenceDecay = timeConstantToAlpha(SILENCE_DECAY_TC, fps);
  }

  /* ── DSP ────────────────────────────────────────────────────── */

  /**
   * Main audio processing callback.
   *
   * Only the first input and the first output are used.  The level is
   * measured on channel 0 and the resulting gain is applied to every
   * channel that exists on both sides.
   *
   * @param {Float32Array[][]} inputs   Input channels.
   * @param {Float32Array[][]} outputs  Output channels.
   * @returns {boolean}  `true` to keep the processor alive.
   */
  process(inputs, outputs) {
    const input = inputs[0];
    const output = outputs[0];

    // No input → nothing to level; leave the output buffers untouched.
    if (!input || input.length === 0 || !input[0]) {
      return true;
    }
    // No output channel to write into → nothing to do either.
    if (!output || output.length === 0) {
      return true;
    }

    // ── Bypass mode ──────────────────────────────────────────
    if (!this._enabled) {
      this._copyThrough(input, output);
      return true;
    }

    // ── 1. RMS level estimation ──────────────────────────────
    const rms = this._trackEnvelope(input[0]);

    // ── 2–4. Silence gate, target gain, gain smoothing ───────
    this._updateGain(rms);

    // ── 5. Apply gain & soft-clip ────────────────────────────
    this._applyGain(input, output, this._currentGain);

    return true;
  }

  /**
   * Copy every input channel to the matching output channel unchanged.
   *
   * @param {Float32Array[]} input
   * @param {Float32Array[]} output
   */
  _copyThrough(input, output) {
    const numSamples = input[0].length;
    for (let ch = 0; ch < input.length; ch++) {
      const src = input[ch];
      const dst = output[ch];
      if (!src || !dst) continue;
      for (let i = 0; i < numSamples; i++) {
        dst[i] = src[i];
      }
    }
  }

  /**
   * Fold one quantum of samples into the smoothed mean-square envelope
   * and return the resulting RMS level.
   *
   * We work in the squared domain to avoid a sqrt per sample; the sqrt
   * is taken only once per quantum.
   *
   * @param {Float32Array} channel  Samples of the channel being measured.
   * @returns {number}              Smoothed RMS (linear, ≥ 1e-6).
   */
  _trackEnvelope(channel) {
    const numSamples = channel.length;

    let sumSq = 0;
    for (let i = 0; i < numSamples; i++) {
      const s = channel[i];
      sumSq += s * s;
    }
    const frameMeanSq = sumSq / numSamples;

    // A NaN/Infinity sample (or an empty channel → 0/0) would otherwise
    // stick in the envelope, and through it in the gain, forever.  Skip
    // the update for such a frame and keep the previous envelope.
    if (Number.isFinite(frameMeanSq)) {
      // Use attack for rising levels, release for falling ones.
      const alpha = frameMeanSq > this._envelopeSq
        ? this._alphaAttack
        : this._alphaRelease;
      this._envelopeSq += alpha * (frameMeanSq - this._envelopeSq);
    }

    // The floor keeps the later dB conversion away from log10(0).
    return Math.sqrt(Math.max(this._envelopeSq, 1e-12));
  }

  /**
   * Advance `_currentGain` by one quantum based on the measured level.
   *
   * @param {number} rms  Smoothed RMS level (linear).
   */
  _updateGain(rms) {
    // ── 2. Silence gate ──────────────────────────────────────
    //
    // Below the silence threshold no new gain target is computed.
    // With the noise gate on, the gain decays slowly toward unity
    // so we don't slam the listener when speech resumes; with the
    // gate off, the gain is simply frozen.
    if (rms < this._silenceThreshold) {
      if (this._noiseGateEnabled) {
        this._currentGain += this._alphaSilenceDecay * (1.0 - this._currentGain);
      }
      return;
    }

    // ── 3. Target gain computation ───────────────────────────
    //
    // Desired gain = 10^(strength · (targetDbfs − currentDbfs) / 20)
    //
    // A strength of 1.0 corrects fully to the target; "low" applies
    // only 50 % of the correction (in dB).
    const errorDb = this._targetDbfs - linearToDb(rms);
    const correctionDb = errorDb * this._strength;

    // Never boost beyond the configured ceiling, and never attenuate
    // by more than −6 dB (×0.5): the goal is leveling UP, but very
    // loud signals still need to be pulled down a little.
    const desiredGain = Math.max(
      0.5,
      Math.min(this._maxGainLinear, dbToLinear(correctionDb)),
    );

    // ── 4. Gain smoothing ────────────────────────────────────
    //
    // Fast attack when the gain has to come DOWN (a loud signal
    // arrived), slow release when it goes UP (signal got quieter).
    const gainAlpha = desiredGain < this._currentGain
      ? this._alphaAttack
      : this._alphaRelease;

    this._currentGain += gainAlpha * (desiredGain - this._currentGain);
  }

  /**
   * Write `softClip(sample * gain)` for every channel present on both
   * the input and the output.
   *
   * @param {Float32Array[]} input
   * @param {Float32Array[]} output
   * @param {number} gain  Linear gain applied to every sample.
   */
  _applyGain(input, output, gain) {
    const numSamples = input[0].length;
    for (let ch = 0; ch < input.length; ch++) {
      const src = input[ch];
      const dst = output[ch];
      if (!src || !dst) continue;
      for (let i = 0; i < numSamples; i++) {
        dst[i] = softClip(src[i] * gain);
      }
    }
  }
}

// Register the class under PROCESSOR_NAME ('VoiceLevelingProcessor') so
// that an AudioWorkletNode created with that name is backed by it.
registerProcessor(PROCESSOR_NAME, VoiceLevelingProcessor);