// Client-side live captions via Whisper-tiny running in the browser.
// Zero server cost: model + library load from huggingface.co + esm.sh,
// inference runs on the viewer's machine (WebGPU preferred, WASM
// fallback). Only works for HLS channels — YouTube iframes are
// cross-origin sandboxed and don't permit audio extraction.
//
// Wiring:
//   <HlsCaptioner channel={ch} onText={(t) => setCaption(t)}
//                 onStatus={(s) => setCaptionStatus(s)} />
// Mount it conditionally when the user enables CC and the current
// channel has an `hls` URL. The component renders nothing; the
// transcription text is delivered via the onText callback so the parent
// can render it however it wants (lower-third overlay inside the CRT).

// Hooks are read off a `React` binding that this file does not import, so
// React must already be in scope when this script runs.
// NOTE(review): the `C` suffix presumably keeps these top-level names from
// colliding with hook aliases declared by other scripts on the page — confirm.
// NOTE(review): `useRefC` is not referenced anywhere else in this file.
const { useEffect: useEffectC, useRef: useRefC } = React;

/**
 * Lookup table: two-letter `channel.lang` code -> the value handed to
 * Whisper as its `language` option. Codes missing from this table are
 * handled by the consumer (HlsCaptioner falls back to "english").
 */
const WHISPER_LANG = {
  "en": "english",
  "fr": "french",
  "es": "spanish",
  "ja": "japanese",
  "ko": "korean",
};

// One pipeline per page — model load is expensive (~39 MB first time)
// and reuse is mandatory. Persistent across channel switches; the
// `language` is passed per-inference so we don't reload the model.
let __whisperPipeline = null;
// Promise of the load currently in flight, or null when no load is running.
let __whisperLoading = null;
// Progress callbacks of every caller waiting on the in-flight load. The
// library only accepts one callback per pipeline() call, so we fan out.
const __whisperProgressListeners = new Set();

/**
 * Return the shared Whisper ASR pipeline, loading it on first use.
 *
 * Concurrent callers share a single in-flight load, and each caller's
 * `onProgress` receives the progress events of that load. If the load
 * fails, the in-flight slot is cleared so a later call starts a fresh
 * attempt instead of replaying the cached rejection forever.
 *
 * @param {(info: object) => void} [onProgress] Receives the library's raw
 *   progress events while the model is loading. Never called when the
 *   pipeline is already cached.
 * @returns {Promise<Function>} Resolves to the callable ASR pipeline.
 * @throws Rejects when the library import fails or when neither the
 *   WebGPU nor the fallback backend can be initialized.
 */
async function getWhisperPipeline(onProgress) {
  if (__whisperPipeline) return __whisperPipeline;

  if (typeof onProgress === "function") {
    __whisperProgressListeners.add(onProgress);
  }
  if (__whisperLoading) return __whisperLoading;

  // Single callback handed to the library; relays to all current waiters.
  const notifyProgress = (info) => {
    for (const listener of __whisperProgressListeners) {
      try {
        listener(info);
      } catch (err) {
        // One misbehaving listener must not starve the others or abort the load.
        console.warn("[captions] progress listener threw:", err);
      }
    }
  };

  __whisperLoading = (async () => {
    try {
      // ESM import via CDN — Babel-standalone leaves dynamic import() as-is.
      const transformers = await import("https://esm.sh/@huggingface/transformers@3.4.2");
      const { pipeline, env } = transformers;

      // Always fetch from the hub, and let the library keep the model in
      // its browser cache so reloads are fast.
      env.allowLocalModels = false;
      env.useBrowserCache = true;

      // Whisper-tiny multilingual: ~39 MB, good enough for news anchors,
      // covers en/fr/es/ja/ko. Upgrade to whisper-base (~74 MB) if quality
      // matters more than load time.
      const modelId = "Xenova/whisper-tiny";

      // Prefer WebGPU; fall back to the library's default backend (WASM)
      // if unavailable or the device refuses (older Chrome, Firefox
      // stable, no GPU).
      let asr;
      try {
        asr = await pipeline("automatic-speech-recognition", modelId, {
          device: "webgpu",
          dtype: "fp32",
          progress_callback: notifyProgress,
        });
      } catch (err) {
        console.warn(
          "[captions] WebGPU init failed, using WASM:",
          err?.message ?? err
        );
        asr = await pipeline("automatic-speech-recognition", modelId, {
          progress_callback: notifyProgress,
        });
      }
      __whisperPipeline = asr;
      return asr;
    } finally {
      // Success: later calls hit the __whisperPipeline fast path.
      // Failure: clearing the slot lets the next call retry the load.
      __whisperLoading = null;
      __whisperProgressListeners.clear();
    }
  })();

  return __whisperLoading;
}

/**
 * Headless React component that transcribes the audio of the active HLS
 * <video> and reports text through callbacks. Renders nothing.
 *
 * The effect re-runs only when `channel.num` changes, so `channel.lang`,
 * `onText` and `onStatus` are the values captured at that moment.
 *
 * @param {object} props
 * @param {{num: *, lang: string}} props.channel Current channel; `lang`
 *   is looked up in WHISPER_LANG (unknown codes fall back to "english").
 * @param {(text: string) => void} props.onText Called with each non-empty
 *   transcription chunk.
 * @param {(status: {kind: string, message: string, progress?: number}) => void} props.onStatus
 *   Called with "loading" / "active" / "error" status updates.
 * @returns {null}
 */
function HlsCaptioner({ channel, onText, onStatus }) {
  // The captioner re-attaches when the channel changes (so audio flows
  // from the *current* HLS video element). Model + pipeline are reused.
  useEffectC(() => {
    let cancelled = false;
    let cleanup = null;

    // Best-effort release helpers: teardown must never throw.
    const stopTracks = (mediaStream) => {
      try { mediaStream.getTracks().forEach((t) => t.stop()); } catch (_) {}
    };
    const closeContext = (ctx) => {
      try {
        const closing = ctx.close();
        // close() returns a promise in current browsers; don't let a
        // rejection surface as an unhandled rejection during teardown.
        if (closing && typeof closing.catch === "function") closing.catch(() => {});
      } catch (_) {}
    };

    (async () => {
      onStatus({ kind: "loading", message: "Initializing…", progress: 0 });

      let asr;
      try {
        asr = await getWhisperPipeline((p) => {
          if (cancelled) return;
          // Only byte progress is surfaced. "done" is emitted once per
          // downloaded file, so treating it as "ready" would flip the UI
          // to "Listening…" while other model files are still loading;
          // the active status is reported below once the pipeline exists.
          if (p.status === "progress" && p.total) {
            const pct = Math.round((p.loaded / p.total) * 100);
            onStatus({
              kind: "loading",
              progress: pct,
              message: `Loading captions… ${pct}%`,
            });
          }
        });
      } catch (err) {
        if (cancelled) return;
        console.error("[captions] pipeline load failed:", err);
        onStatus({ kind: "error", message: "Captions failed to load" });
        return;
      }
      if (cancelled) return;

      onStatus({ kind: "active", message: "Listening…" });

      // Find the visible HLS <video>. The TV component renders three
      // feeds (prev/cur/next); only one carries `.active`. Wait briefly
      // for it to appear if the user just switched channels.
      let videoEl = null;
      for (let attempt = 0; attempt < 15; attempt++) {
        videoEl = document.querySelector("video.crt-feed.active");
        if (videoEl && videoEl.readyState >= 2) break;
        await new Promise((r) => setTimeout(r, 200));
        // The effect may have been torn down while we slept. Bail out
        // before building an audio graph nobody would ever clean up.
        if (cancelled) return;
      }
      if (!videoEl) {
        onStatus({ kind: "error", message: "No active HLS video" });
        return;
      }

      // captureStream() yields a MediaStream with whatever's playing.
      // Firefox uses mozCaptureStream until ~v124, hence the fallback.
      let stream;
      try {
        stream = videoEl.captureStream
          ? videoEl.captureStream()
          : videoEl.mozCaptureStream();
      } catch (err) {
        console.warn("[captions] captureStream failed:", err?.message ?? err);
        onStatus({ kind: "error", message: "captureStream unavailable" });
        return;
      }
      const audioTracks = stream.getAudioTracks();
      if (audioTracks.length === 0) {
        stopTracks(stream);
        onStatus({ kind: "error", message: "No audio track in stream" });
        return;
      }

      // Whisper expects 16 kHz mono; the context does the resampling.
      const SAMPLE_RATE = 16000;
      let audioCtx = null;
      let source;
      let processor;
      let muteGain;
      try {
        audioCtx = new (window.AudioContext || window.webkitAudioContext)({
          sampleRate: SAMPLE_RATE,
        });
        source = audioCtx.createMediaStreamSource(
          new MediaStream([audioTracks[0]])
        );
        // ScriptProcessor is deprecated but ships everywhere; the
        // AudioWorklet alternative needs a separate worklet file which
        // doesn't fit cleanly into a Babel-CDN setup.
        processor = audioCtx.createScriptProcessor(4096, 1, 1);
        // Route through a muted gain so the captured audio doesn't
        // double-play (the <video> element already drives the speakers).
        muteGain = audioCtx.createGain();
        muteGain.gain.value = 0;
        source.connect(processor);
        processor.connect(muteGain);
        muteGain.connect(audioCtx.destination);
      } catch (err) {
        // Without this the failure would only be an unhandled rejection
        // and the UI would stay on "Listening…" forever.
        console.error("[captions] audio graph setup failed:", err);
        if (audioCtx) closeContext(audioCtx);
        stopTracks(stream);
        onStatus({ kind: "error", message: "Audio capture failed" });
        return;
      }

      const CHUNK_SECONDS = 5;
      const TARGET = SAMPLE_RATE * CHUNK_SECONDS;
      // If inference is slower than real time the backlog would grow
      // without bound (memory plus ever-later captions). Keep at most a
      // few chunks and drop the oldest audio to stay close to live.
      const MAX_BACKLOG = TARGET * 3;
      let buffer = new Float32Array(0);
      let busy = false;

      processor.onaudioprocess = async (ev) => {
        if (cancelled) return;
        const samples = ev.inputBuffer.getChannelData(0);
        // Append to rolling buffer (copies, since the input view is
        // owned by the audio engine).
        let merged = new Float32Array(buffer.length + samples.length);
        merged.set(buffer);
        merged.set(samples, buffer.length);
        if (merged.length > MAX_BACKLOG) {
          merged = merged.slice(merged.length - MAX_BACKLOG);
        }
        buffer = merged;

        if (buffer.length < TARGET || busy) return;

        busy = true;
        const chunk = buffer.slice(0, TARGET);
        // Clean cut — drop the just-processed samples, keep anything
        // that came in during inference. Overlap was tried but caused
        // duplicate words at chunk boundaries.
        buffer = buffer.slice(TARGET);
        try {
          const result = await asr(chunk, {
            language: WHISPER_LANG[channel.lang] || "english",
            task: "transcribe",
            return_timestamps: false,
            // Whisper sometimes hallucinates "Thank you" / "Subtitles by"
            // on silence. The suppress_tokens default helps; quality is
            // good enough for v1.
          });
          if (!cancelled && result && result.text) {
            const text = result.text.trim();
            if (text.length > 0) onText(text);
          }
        } catch (err) {
          console.warn("[captions] inference error:", err?.message ?? err);
        } finally {
          busy = false;
        }
      };

      cleanup = () => {
        processor.onaudioprocess = null;
        try { processor.disconnect(); } catch (_) {}
        try { source.disconnect(); } catch (_) {}
        try { muteGain.disconnect(); } catch (_) {}
        closeContext(audioCtx);
        stopTracks(stream);
      };
      // Defensive: if teardown already ran, release immediately.
      if (cancelled) cleanup();
    })().catch((err) => {
      // Anything not handled above (e.g. a throwing callback) is logged
      // instead of becoming an unhandled rejection.
      console.error("[captions] unexpected captioner failure:", err);
    });

    return () => {
      cancelled = true;
      if (cleanup) cleanup();
    };
  }, [channel.num]);

  return null;
}

// Publish the component on the global object so other scripts on the
// page can reference it by name.
window.HlsCaptioner = HlsCaptioner;
