{
+ if (!videoPath) {
+ toast.error(t("errors.noVideoLoaded"));
+ return;
+ }
+ if (isAutoCaptioningRef.current) {
+ toast.error(t("autoCaptions.busy"));
+ return;
+ }
+ setShowAutoCaptionsDialog(true);
+ }}
/>
diff --git a/src/components/video-editor/projectPersistence.ts b/src/components/video-editor/projectPersistence.ts
index ff59427f2..1fefa43e9 100644
--- a/src/components/video-editor/projectPersistence.ts
+++ b/src/components/video-editor/projectPersistence.ts
@@ -333,6 +333,8 @@ export function normalizeProjectEditor(editor: Partial
): Pro
content: typeof region.content === "string" ? region.content : "",
textContent: typeof region.textContent === "string" ? region.textContent : undefined,
imageContent: typeof region.imageContent === "string" ? region.imageContent : undefined,
+ annotationSource:
+ region.annotationSource === "auto-caption" ? ("auto-caption" as const) : undefined,
position: {
x: clamp(
isFiniteNumber(region.position?.x)
diff --git a/src/components/video-editor/timeline/TimelineEditor.tsx b/src/components/video-editor/timeline/TimelineEditor.tsx
index f84d038a9..65ebd8bdb 100644
--- a/src/components/video-editor/timeline/TimelineEditor.tsx
+++ b/src/components/video-editor/timeline/TimelineEditor.tsx
@@ -1,6 +1,7 @@
import type { Range, Span } from "dnd-timeline";
import { useTimelineContext } from "dnd-timeline";
import {
+ Captions,
Check,
ChevronDown,
Gauge,
@@ -92,6 +93,11 @@ interface TimelineEditorProps {
onAspectRatioChange: (aspectRatio: AspectRatio) => void;
videoUrl?: string;
showTrimWaveform?: boolean;
+ /** Opens the auto-captions flow. When omitted, the captions button is hidden. */
+ onGenerateCaptions?: () => void;
+ isGeneratingCaptions?: boolean;
+ /** Localized label for the auto-captions button (lives in the `editor` namespace). */
+ captionsLabel?: string;
}
interface TimelineScaleConfig {
@@ -924,6 +930,9 @@ export default function TimelineEditor({
onAspectRatioChange,
videoUrl,
showTrimWaveform = false,
+ onGenerateCaptions,
+ isGeneratingCaptions = false,
+ captionsLabel,
}: TimelineEditorProps) {
const t = useScopedT("timeline");
const totalMs = useMemo(() => Math.max(0, Math.round(videoDuration * 1000)), [videoDuration]);
@@ -1659,6 +1668,18 @@ export default function TimelineEditor({
>
+ {onGenerateCaptions && (
+
+ )}
diff --git a/src/components/video-editor/types.ts b/src/components/video-editor/types.ts
index 0f2267cca..1aca90af1 100644
--- a/src/components/video-editor/types.ts
+++ b/src/components/video-editor/types.ts
@@ -288,6 +288,8 @@ export interface AnnotationRegion {
size: AnnotationSize;
style: AnnotationTextStyle;
zIndex: number;
+ /** When set, layout/style edits on one region can sync to all auto-caption siblings. */
+ annotationSource?: "auto-caption";
figureData?: FigureData;
blurData?: BlurData;
}
diff --git a/src/i18n/locales/ar/editor.json b/src/i18n/locales/ar/editor.json
index b3e122280..39750e5eb 100644
--- a/src/i18n/locales/ar/editor.json
+++ b/src/i18n/locales/ar/editor.json
@@ -44,6 +44,25 @@
"permissionDenied": "تم رفض إذن التسجيل. يرجى السماح بتسجيل الشاشة.",
"accessibilityAllowAndRetry": "اسمح بوصول تسهيلات الاستخدام لـ OpenScreen، ثم اضغط على التسجيل مرة أخرى لبدء العد التنازلي."
},
+ "autoCaptions": {
+ "button": "التسميات التوضيحية التلقائية",
+ "dialogTitle": "التسميات التوضيحية التلقائية",
+ "dialogDescription": "اختر تقريبا كم عدد الكلمات التي تظهر في كل تسمية توضيحية. يتم توزيع التوقيت عبر الكلمات في تلك العبارة.",
+ "minWords": "الحد الأدنى من الكلمات لكل تسمية",
+ "maxWords": "الحد الأقصى من الكلمات لكل تسمية",
+ "wordsCount": "{{count}} كلمة",
+ "generate": "توليد",
+ "dialogCancel": "إلغاء",
+ "generating": "جارٍ توليد التسميات من الصوت…",
+ "loadingModel": "جارٍ تحميل نموذج الكلام (سيتم تنزيل ~75 ميغابايت عند الاستخدام الأول)…",
+ "transcribing": "جارٍ نسخ الكلام إلى نص…",
+ "busy": "توليد التسميات قيد التنفيذ بالفعل.",
+ "done": "تمت إضافة {{count}} تسمية.",
+ "noneHeard": "لم يتم الكشف عن أي كلام.",
+ "noAudio": "لا يحتوي هذا الفيديو على صوت صالح للنسخ.",
+ "failed": "تعذّر توليد التسميات.",
+ "truncated": "تم نسخ الدقائق الأولى فقط: {{minutes}} دقيقة."
+ },
"emptyState": {
"title": "لا يوجد مشروع مفتوح",
"description": "استورد مقطع فيديو للبدء في التحرير، أو حمّل مشروع OpenScreen موجود.",
diff --git a/src/i18n/locales/en/editor.json b/src/i18n/locales/en/editor.json
index ebd9a5d5f..d6a56f033 100644
--- a/src/i18n/locales/en/editor.json
+++ b/src/i18n/locales/en/editor.json
@@ -44,6 +44,25 @@
"permissionDenied": "Recording permission denied. Please allow screen recording.",
"accessibilityAllowAndRetry": "Allow Accessibility access for OpenScreen, then press record again to start the countdown."
},
+ "autoCaptions": {
+ "button": "Auto captions",
+ "dialogTitle": "Auto captions",
+ "dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+ "minWords": "Minimum words per caption",
+ "maxWords": "Maximum words per caption",
+ "wordsCount": "{{count}} words",
+ "generate": "Generate",
+ "dialogCancel": "Cancel",
+ "generating": "Generating captions from audio…",
+ "loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+ "transcribing": "Transcribing speech…",
+ "busy": "Caption generation is already in progress.",
+ "done": "Added {{count}} captions.",
+ "noneHeard": "No speech was detected.",
+ "noAudio": "This video has no usable audio to transcribe.",
+ "failed": "Could not generate captions.",
+ "truncated": "Only the first {{minutes}} minutes were transcribed."
+ },
"emptyState": {
"title": "No project open",
"description": "Import a video to start editing, or load an existing OpenScreen project.",
diff --git a/src/i18n/locales/es/editor.json b/src/i18n/locales/es/editor.json
index 16a2c8547..277ce40ff 100644
--- a/src/i18n/locales/es/editor.json
+++ b/src/i18n/locales/es/editor.json
@@ -44,6 +44,25 @@
"cancel": "Cancelar",
"confirm": "Confirmar"
},
+ "autoCaptions": {
+ "button": "Subtítulos automáticos",
+ "dialogTitle": "Subtítulos automáticos",
+ "dialogDescription": "Elige aproximadamente cuántas palabras muestra cada subtítulo a la vez. El tiempo se reparte entre las palabras de esa frase.",
+ "minWords": "Número mínimo de palabras por subtítulo",
+ "maxWords": "Número máximo de palabras por subtítulo",
+ "wordsCount": "{{count}} palabras",
+ "generate": "Generar",
+ "dialogCancel": "Cancelar",
+ "generating": "Generando subtítulos a partir del audio…",
+ "loadingModel": "Cargando el modelo de voz (el primer uso descarga ~75 MB)…",
+ "transcribing": "Transcribiendo el habla…",
+ "busy": "La generación de subtítulos ya está en curso.",
+ "done": "Se añadieron {{count}} subtítulos.",
+ "noneHeard": "No se detectó voz.",
+ "noAudio": "Este video no tiene audio utilizable para transcribir.",
+ "failed": "No se pudieron generar los subtítulos.",
+ "truncated": "Solo se transcribieron los primeros {{minutes}} minutos."
+ },
"emptyState": {
"title": "No hay proyecto abierto",
"description": "Importa un video para empezar a editar o carga un proyecto de OpenScreen existente.",
diff --git a/src/i18n/locales/fr/editor.json b/src/i18n/locales/fr/editor.json
index 4eb57a9cc..40dc24fd7 100644
--- a/src/i18n/locales/fr/editor.json
+++ b/src/i18n/locales/fr/editor.json
@@ -44,6 +44,25 @@
},
"loadingVideo": "Chargement de la vidéo...",
"loadingEditor": "Chargement de l'éditeur...",
+ "autoCaptions": {
+ "button": "Sous-titres automatiques",
+ "dialogTitle": "Sous-titres automatiques",
+ "dialogDescription": "Choisissez approximativement combien de mots chaque sous-titre affiche à la fois. Le timing est réparti entre les mots de cette phrase.",
+ "minWords": "Nombre minimum de mots par sous-titre",
+ "maxWords": "Nombre maximum de mots par sous-titre",
+ "wordsCount": "{{count}} mots",
+ "generate": "Générer",
+ "dialogCancel": "Annuler",
+ "generating": "Génération des sous-titres à partir de l'audio…",
+ "loadingModel": "Chargement du modèle vocal (le premier usage télécharge ~75 MB)…",
+ "transcribing": "Transcription de la parole…",
+ "busy": "La génération des sous-titres est déjà en cours.",
+ "done": "{{count}} sous-titres ajoutés.",
+ "noneHeard": "Aucune parole n'a été détectée.",
+ "noAudio": "Cette vidéo ne contient pas d'audio exploitable pour la transcription.",
+ "failed": "Impossible de générer les sous-titres.",
+ "truncated": "Seules les {{minutes}} premières minutes ont été transcrites."
+ },
"emptyState": {
"title": "Aucun projet ouvert",
"description": "Importez une vidéo pour commencer à éditer, ou chargez un projet OpenScreen existant.",
diff --git a/src/i18n/locales/it/editor.json b/src/i18n/locales/it/editor.json
index 336d3e6ba..0e94b9a9f 100644
--- a/src/i18n/locales/it/editor.json
+++ b/src/i18n/locales/it/editor.json
@@ -42,5 +42,24 @@
"cameraNotFound": "Fotocamera non trovata.",
"permissionDenied": "Autorizzazione di registrazione negata. Consenti la registrazione dello schermo.",
"accessibilityAllowAndRetry": "Consenti l'accesso all'accessibilità per OpenScreen, poi premi di nuovo registra per avviare il conto alla rovescia."
+ },
+ "autoCaptions": {
+ "button": "Sottotitoli automatici",
+ "dialogTitle": "Sottotitoli automatici",
+ "dialogDescription": "Scegli all'incirca quante parole mostrare per ogni sottotitolo. La temporizzazione viene distribuita tra le parole della frase.",
+ "minWords": "Numero minimo di parole per sottotitolo",
+ "maxWords": "Numero massimo di parole per sottotitolo",
+ "wordsCount": "{{count}} parole",
+ "generate": "Genera",
+ "dialogCancel": "Annulla",
+ "generating": "Generazione dei sottotitoli dall'audio…",
+ "loadingModel": "Caricamento del modello vocale (al primo utilizzo vengono scaricati ~75 MB)…",
+ "transcribing": "Trascrizione del parlato…",
+ "busy": "La generazione dei sottotitoli è già in corso.",
+ "done": "Aggiunti {{count}} sottotitoli.",
+ "noneHeard": "Nessun parlato rilevato.",
+ "noAudio": "Questo video non contiene audio utilizzabile per la trascrizione.",
+ "failed": "Impossibile generare i sottotitoli.",
+ "truncated": "Sono stati trascritti solo i primi {{minutes}} minuti."
}
}
diff --git a/src/i18n/locales/ja-JP/editor.json b/src/i18n/locales/ja-JP/editor.json
index 5151d1054..8e0da42e1 100644
--- a/src/i18n/locales/ja-JP/editor.json
+++ b/src/i18n/locales/ja-JP/editor.json
@@ -44,6 +44,25 @@
"cameraNotFound": "カメラが見つかりません。",
"accessibilityAllowAndRetry": "OpenScreenにアクセシビリティアクセスを許可してから、もう一度録画を押してカウントダウンを開始してください。"
},
+ "autoCaptions": {
+ "button": "自動キャプション",
+ "dialogTitle": "自動キャプション",
+ "dialogDescription": "各キャプションに一度に表示する語数の目安を選びます。タイミングはそのフレーズ内の語に分配されます。",
+ "minWords": "キャプションあたりの最小語数",
+ "maxWords": "キャプションあたりの最大語数",
+ "wordsCount": "{{count}} 語",
+ "generate": "生成",
+ "dialogCancel": "キャンセル",
+ "generating": "音声からキャプションを生成しています…",
+ "loadingModel": "音声モデルを読み込んでいます(初回利用時は約 75 MB をダウンロードします)…",
+ "transcribing": "音声を文字起こししています…",
+ "busy": "キャプションの生成はすでに実行中です。",
+ "done": "{{count}} 件のキャプションを追加しました。",
+ "noneHeard": "音声が検出されませんでした。",
+ "noAudio": "この動画には文字起こしに使える音声がありません。",
+ "failed": "キャプションを生成できませんでした。",
+ "truncated": "最初の {{minutes}} 分のみが文字起こしされました。"
+ },
"emptyState": {
"title": "プロジェクトが開かれていません",
"description": "動画をインポートして編集を開始するか、既存の OpenScreen プロジェクトを読み込んでください。",
diff --git a/src/i18n/locales/ko-KR/editor.json b/src/i18n/locales/ko-KR/editor.json
index 23990c386..a63a22a57 100644
--- a/src/i18n/locales/ko-KR/editor.json
+++ b/src/i18n/locales/ko-KR/editor.json
@@ -44,6 +44,25 @@
"cameraNotFound": "카메라를 찾을 수 없습니다.",
"accessibilityAllowAndRetry": "OpenScreen의 손쉬운 사용 접근을 허용한 다음, 카운트다운을 시작하려면 다시 녹화를 누르세요."
},
+ "autoCaptions": {
+ "button": "자동 자막",
+ "dialogTitle": "자동 자막",
+ "dialogDescription": "각 자막에 한 번에 표시할 단어 수의 대략적인 값을 선택하세요. 타이밍은 해당 구문의 단어들에 나뉩니다.",
+ "minWords": "자막당 최소 단어 수",
+ "maxWords": "자막당 최대 단어 수",
+ "wordsCount": "{{count}}개 단어",
+ "generate": "생성",
+ "dialogCancel": "취소",
+ "generating": "오디오에서 자막을 생성하는 중…",
+ "loadingModel": "음성 모델을 불러오는 중(첫 사용 시 약 75MB 다운로드)…",
+ "transcribing": "음성을 전사하는 중…",
+ "busy": "자막 생성이 이미 진행 중입니다.",
+ "done": "자막 {{count}}개를 추가했습니다.",
+ "noneHeard": "음성이 감지되지 않았습니다.",
+ "noAudio": "이 동영상에는 전사에 사용할 수 있는 음성이 없습니다.",
+ "failed": "자막을 생성할 수 없습니다.",
+ "truncated": "처음 {{minutes}}분만 전사되었습니다."
+ },
"emptyState": {
"title": "열린 프로젝트 없음",
"description": "동영상을 가져와 편집을 시작하거나 기존 OpenScreen 프로젝트를 불러오세요.",
diff --git a/src/i18n/locales/pt-BR/editor.json b/src/i18n/locales/pt-BR/editor.json
index 7e3f69531..b0e9ab8c9 100644
--- a/src/i18n/locales/pt-BR/editor.json
+++ b/src/i18n/locales/pt-BR/editor.json
@@ -41,5 +41,24 @@
"cameraDisconnected": "Webcam desconectada.",
"cameraNotFound": "Câmera não encontrada.",
"permissionDenied": "Permissão de gravação negada. Por favor, permita a gravação de tela."
+ },
+ "autoCaptions": {
+ "button": "Legendas automáticas",
+ "dialogTitle": "Legendas automáticas",
+ "dialogDescription": "Escolha aproximadamente quantas palavras cada legenda mostra de cada vez. O tempo é distribuído entre as palavras da frase.",
+ "minWords": "Mínimo de palavras por legenda",
+ "maxWords": "Máximo de palavras por legenda",
+ "wordsCount": "{{count}} palavras",
+ "generate": "Gerar",
+ "dialogCancel": "Cancelar",
+ "generating": "Gerando legendas a partir do áudio…",
+ "loadingModel": "Carregando o modelo de fala (o primeiro uso baixa ~75 MB)…",
+ "transcribing": "Transcrevendo a fala…",
+ "busy": "A geração de legendas já está em andamento.",
+ "done": "{{count}} legendas adicionadas.",
+ "noneHeard": "Nenhuma fala foi detectada.",
+ "noAudio": "Este vídeo não tem áudio utilizável para transcrição.",
+ "failed": "Não foi possível gerar as legendas.",
+ "truncated": "Apenas os primeiros {{minutes}} minutos foram transcritos."
}
}
diff --git a/src/i18n/locales/ru/editor.json b/src/i18n/locales/ru/editor.json
index ff0c80b8b..78fa129a1 100644
--- a/src/i18n/locales/ru/editor.json
+++ b/src/i18n/locales/ru/editor.json
@@ -44,6 +44,25 @@
"permissionDenied": "Разрешение на запись запрещено. Пожалуйста, разрешите запись экрана.",
"accessibilityAllowAndRetry": "Разрешите OpenScreen доступ к Универсальному доступу, затем снова нажмите запись, чтобы начать обратный отсчет."
},
+ "autoCaptions": {
+ "button": "Автосубтитры",
+ "dialogTitle": "Автосубтитры",
+ "dialogDescription": "Выберите, сколько примерно слов показывать в одном субтитре. Время распределяется между словами фразы.",
+ "minWords": "Минимум слов в субтитре",
+ "maxWords": "Максимум слов в субтитре",
+ "wordsCount": "{{count}} слов",
+ "generate": "Создать",
+ "dialogCancel": "Отмена",
+ "generating": "Создание субтитров из звука…",
+ "loadingModel": "Загрузка речевой модели (при первом запуске скачивается ~75 МБ)…",
+ "transcribing": "Распознавание речи…",
+ "busy": "Создание субтитров уже выполняется.",
+ "done": "Добавлено субтитров: {{count}}.",
+ "noneHeard": "Речь не обнаружена.",
+ "noAudio": "В этом видео нет звука, пригодного для расшифровки.",
+ "failed": "Не удалось создать субтитры.",
+ "truncated": "Расшифрованы только первые {{minutes}} мин."
+ },
"emptyState": {
"title": "Нет открытых проектов",
"description": "Импортируйте видео для начала редактирования или загрузите существующий проект OpenScreen.",
diff --git a/src/i18n/locales/tr/editor.json b/src/i18n/locales/tr/editor.json
index de45a180f..89203e719 100644
--- a/src/i18n/locales/tr/editor.json
+++ b/src/i18n/locales/tr/editor.json
@@ -44,6 +44,25 @@
"cancel": "İptal",
"confirm": "Onayla"
},
+ "autoCaptions": {
+ "button": "Otomatik altyazılar",
+ "dialogTitle": "Otomatik altyazılar",
+ "dialogDescription": "Her altyazının aynı anda yaklaşık kaç kelime göstermesini istediğinizi seçin. Zamanlama, o ifadedeki kelimelere dağıtılır.",
+ "minWords": "Altyazı başına en az kelime",
+ "maxWords": "Altyazı başına en fazla kelime",
+ "wordsCount": "{{count}} kelime",
+ "generate": "Oluştur",
+ "dialogCancel": "İptal",
+ "generating": "Sesten altyazılar oluşturuluyor…",
+ "loadingModel": "Konuşma modeli yükleniyor (ilk kullanımda ~75 MB indirilir)…",
+ "transcribing": "Konuşma yazıya dökülüyor…",
+ "busy": "Altyazı oluşturma zaten devam ediyor.",
+ "done": "{{count}} altyazı eklendi.",
+ "noneHeard": "Konuşma algılanmadı.",
+ "noAudio": "Bu videoda yazıya dökülebilecek kullanılabilir bir ses yok.",
+ "failed": "Altyazılar oluşturulamadı.",
+ "truncated": "Yalnızca ilk {{minutes}} dakika yazıya döküldü."
+ },
"emptyState": {
"title": "Açık proje yok",
"description": "Düzenlemeye başlamak için bir video içe aktarın veya mevcut bir OpenScreen projesi yükleyin.",
diff --git a/src/i18n/locales/vi/editor.json b/src/i18n/locales/vi/editor.json
index 1875bb559..90004091e 100644
--- a/src/i18n/locales/vi/editor.json
+++ b/src/i18n/locales/vi/editor.json
@@ -44,6 +44,25 @@
"permissionDenied": "Quyền ghi hình bị từ chối. Vui lòng cho phép ghi màn hình.",
"accessibilityAllowAndRetry": "Cho phép OpenScreen truy cập Trợ năng, sau đó nhấn ghi lại để bắt đầu đếm ngược."
},
+ "autoCaptions": {
+ "button": "Phụ đề tự động",
+ "dialogTitle": "Phụ đề tự động",
+ "dialogDescription": "Chọn khoảng bao nhiêu từ mỗi phụ đề hiển thị cùng lúc. Thời gian được phân bổ cho các từ trong cụm từ đó.",
+ "minWords": "Số từ tối thiểu mỗi phụ đề",
+ "maxWords": "Số từ tối đa mỗi phụ đề",
+ "wordsCount": "{{count}} từ",
+ "generate": "Tạo",
+ "dialogCancel": "Hủy",
+ "generating": "Đang tạo phụ đề từ âm thanh…",
+ "loadingModel": "Đang tải mô hình giọng nói (lần đầu sử dụng sẽ tải ~75 MB)…",
+ "transcribing": "Đang chuyển lời nói thành văn bản…",
+ "busy": "Việc tạo phụ đề đang được tiến hành.",
+ "done": "Đã thêm {{count}} phụ đề.",
+ "noneHeard": "Không phát hiện thấy lời nói.",
+ "noAudio": "Video này không có âm thanh dùng được để chuyển thành văn bản.",
+ "failed": "Không thể tạo phụ đề.",
+ "truncated": "Chỉ {{minutes}} phút đầu tiên được chuyển thành văn bản."
+ },
"emptyState": {
"title": "Không có dự án nào được mở",
"description": "Nhập video để bắt đầu chỉnh sửa hoặc tải một dự án OpenScreen hiện có.",
diff --git a/src/i18n/locales/zh-CN/editor.json b/src/i18n/locales/zh-CN/editor.json
index d11f1dd95..58f6ae27b 100644
--- a/src/i18n/locales/zh-CN/editor.json
+++ b/src/i18n/locales/zh-CN/editor.json
@@ -44,6 +44,25 @@
"permissionDenied": "录屏权限被拒绝。请允许屏幕录制。",
"accessibilityAllowAndRetry": "允许 OpenScreen 使用辅助功能权限,然后再次按录制以开始倒计时。"
},
+ "autoCaptions": {
+ "button": "自动字幕",
+ "dialogTitle": "自动字幕",
+ "dialogDescription": "大致选择每条字幕一次显示多少个字词。时间会在该语句内的字词之间分配。",
+ "minWords": "每条字幕的最少词数",
+ "maxWords": "每条字幕的最多词数",
+ "wordsCount": "{{count}} 个词",
+ "generate": "生成",
+ "dialogCancel": "取消",
+ "generating": "正在从音频生成字幕…",
+ "loadingModel": "正在加载语音模型(首次使用将下载约 75 MB)…",
+ "transcribing": "正在转写语音…",
+ "busy": "字幕生成已在进行中。",
+ "done": "已添加 {{count}} 条字幕。",
+ "noneHeard": "未检测到语音。",
+ "noAudio": "此视频没有可用于转写的音频。",
+ "failed": "无法生成字幕。",
+ "truncated": "仅转写了最前 {{minutes}} 分钟。"
+ },
"emptyState": {
"title": "未打开任何项目",
"description": "导入视频开始编辑,或加载已有的 OpenScreen 项目。",
diff --git a/src/i18n/locales/zh-TW/editor.json b/src/i18n/locales/zh-TW/editor.json
index 131518713..8a6485409 100644
--- a/src/i18n/locales/zh-TW/editor.json
+++ b/src/i18n/locales/zh-TW/editor.json
@@ -44,6 +44,25 @@
"cameraNotFound": "找不到攝影機。",
"accessibilityAllowAndRetry": "允許 OpenScreen 使用輔助使用權限,然後再次按下錄製以開始倒數。"
},
+ "autoCaptions": {
+ "button": "自動字幕",
+ "dialogTitle": "自動字幕",
+ "dialogDescription": "大致選擇每條字幕一次顯示多少字詞。時間會在該語句內的字詞之間分配。",
+ "minWords": "每條字幕的最少詞數",
+ "maxWords": "每條字幕的最多詞數",
+ "wordsCount": "{{count}} 個詞",
+ "generate": "產生",
+ "dialogCancel": "取消",
+ "generating": "正在從音訊產生字幕…",
+ "loadingModel": "正在載入語音模型(首次使用將下載約 75 MB)…",
+ "transcribing": "正在轉錄語音…",
+ "busy": "字幕產生已在進行中。",
+ "done": "已新增 {{count}} 條字幕。",
+ "noneHeard": "未偵測到語音。",
+ "noAudio": "此影片沒有可用於轉錄的音訊。",
+ "failed": "無法產生字幕。",
+ "truncated": "僅轉錄了最前 {{minutes}} 分鐘。"
+ },
"emptyState": {
"title": "未開啟任何專案",
"description": "匯入影片以開始編輯,或載入現有的 OpenScreen 專案。",
diff --git a/src/lib/captioning/annotationsFromCaptions.test.ts b/src/lib/captioning/annotationsFromCaptions.test.ts
new file mode 100644
index 000000000..bbf26fed2
--- /dev/null
+++ b/src/lib/captioning/annotationsFromCaptions.test.ts
@@ -0,0 +1,178 @@
+import { describe, expect, it } from "vitest";
+
+import {
+ captionSegmentsToAnnotationRegions,
+ groupPhraseCaptionSegmentsIntoLines,
+ groupTimedCaptionWordsIntoLines,
+ reconcileAutoCaptionTimelineGaps,
+} from "./annotationsFromCaptions";
+
+// Phrase-granularity grouping: each input segment is one model phrase with a single start/end.
+// NOTE(review): the two trailing numeric arguments are presumably min/max words per caption
+// (mirroring `minWordsPerCaption` / `maxWordsPerCaption` used elsewhere in this file) — confirm
+// against the function signature.
+describe("groupPhraseCaptionSegmentsIntoLines", () => {
+ it("preserves phrase boundaries when formatting phrase-timestamp captions", () => {
+ const lines = groupPhraseCaptionSegmentsIntoLines(
+ [
+ { startSec: 0, endSec: 0.5, text: "alpha beta" },
+ { startSec: 0.62, endSec: 1.6, text: "gamma delta" },
+ ],
+ 2,
+ 2,
+ );
+
+ expect(lines).toHaveLength(2);
+ expect(lines[0]).toMatchObject({ text: "alpha beta", startSec: 0 });
+ expect(lines[1]).toMatchObject({ text: "gamma delta", startSec: 0.62 });
+ // The first line may end anywhere up to the second phrase's start, but must not run past it.
+ expect(lines[0]!.endSec).toBeLessThanOrEqual(0.62);
+ });
+
+ it("slices a single merged phrase into timed caption lines by word bounds", () => {
+ // One 1-second phrase of four words: expected output is two 2-word lines that split the
+ // phrase's duration in half (0–0.5 s and 0.5–1 s).
+ const lines = groupPhraseCaptionSegmentsIntoLines(
+ [{ startSec: 0, endSec: 1, text: "alpha beta gamma delta" }],
+ 2,
+ 2,
+ );
+
+ expect(lines).toHaveLength(2);
+ expect(lines[0]).toMatchObject({
+ startSec: 0,
+ endSec: 0.5,
+ text: "alpha beta",
+ });
+ expect(lines[1]).toMatchObject({
+ startSec: 0.5,
+ endSec: 1,
+ text: "gamma delta",
+ });
+ });
+});
+
+// End-to-end checks for turning caption segments into annotation regions (times in ms).
+describe("captionSegmentsToAnnotationRegions", () => {
+ it("uses raw phrase timing instead of shifting caption boundaries", () => {
+ const { regions } = captionSegmentsToAnnotationRegions(
+ [
+ { startSec: 0, endSec: 0.5, text: "first second" },
+ { startSec: 0.62, endSec: 1.2, text: "third fourth" },
+ ],
+ 1,
+ 1,
+ { minWordsPerCaption: 2, maxWordsPerCaption: 2, timestampGranularity: "phrase" },
+ );
+
+ // Region bounds are the input seconds × 1000, including the 120 ms gap between phrases.
+ expect(regions).toHaveLength(2);
+ expect(regions[0]).toMatchObject({ startMs: 0, endMs: 500 });
+ expect(regions[1]).toMatchObject({ startMs: 620, endMs: 1200 });
+ });
+
+ // NOTE(review): this case exercises `groupTimedCaptionWordsIntoLines`, not
+ // `captionSegmentsToAnnotationRegions`; it may belong in its own `describe`.
+ it("preserves empty timeline space when word timestamps contain a real pause", () => {
+ const lines = groupTimedCaptionWordsIntoLines(
+ [
+ { startSec: 0, endSec: 0.12, text: "first" },
+ { startSec: 0.13, endSec: 0.28, text: "caption" },
+ { startSec: 0.7, endSec: 0.83, text: "second" },
+ { startSec: 0.84, endSec: 0.98, text: "caption" },
+ ],
+ 2,
+ 2,
+ );
+
+ // The 0.42 s silence between word 2 (ends 0.28) and word 3 (starts 0.7) must stay empty:
+ // neither line is stretched to cover it.
+ expect(lines).toHaveLength(2);
+ expect(lines[0]).toMatchObject({ startSec: 0, endSec: 0.28, text: "first caption" });
+ expect(lines[1]).toMatchObject({ startSec: 0.7, endSec: 0.98, text: "second caption" });
+ });
+
+ it("preserves repeated words before grouping in word mode", () => {
+ // Two identical, nearly back-to-back words must both survive into the single caption.
+ const { regions } = captionSegmentsToAnnotationRegions(
+ [
+ { startSec: 0, endSec: 0.12, text: "I" },
+ { startSec: 0.13, endSec: 0.25, text: "I" },
+ ],
+ 1,
+ 1,
+ { minWordsPerCaption: 2, maxWordsPerCaption: 2, timestampGranularity: "word" },
+ );
+
+ expect(regions).toHaveLength(1);
+ expect(regions[0]).toMatchObject({ content: "I I" });
+ });
+});
+
+// Gap reconciliation between auto-caption regions. Manual regions (no `annotationSource`) are
+// included in the fixture to check they pass through untouched.
+describe("reconcileAutoCaptionTimelineGaps", () => {
+ it("does not change regions when the minimum enforced gap is zero", () => {
+ // Pass the gap explicitly so this case keeps testing the zero-gap path even if the module's
+ // default minimum gap is tuned later.
+ const regions = reconcileAutoCaptionTimelineGaps(
+ [
+ {
+ id: "annotation-1",
+ startMs: 0,
+ endMs: 120,
+ type: "text",
+ content: "one",
+ annotationSource: "auto-caption",
+ position: { x: 0, y: 0 },
+ size: { width: 10, height: 10 },
+ style: {
+ color: "#fff",
+ backgroundColor: "transparent",
+ fontSize: 24,
+ fontFamily: "Inter",
+ fontWeight: "normal",
+ fontStyle: "normal",
+ textDecoration: "none",
+ textAlign: "center",
+ },
+ zIndex: 1,
+ },
+ {
+ id: "manual-1",
+ startMs: 50,
+ endMs: 1000,
+ type: "text",
+ content: "manual",
+ position: { x: 10, y: 10 },
+ size: { width: 10, height: 10 },
+ style: {
+ color: "#fff",
+ backgroundColor: "transparent",
+ fontSize: 24,
+ fontFamily: "Inter",
+ fontWeight: "normal",
+ fontStyle: "normal",
+ textDecoration: "none",
+ textAlign: "center",
+ },
+ zIndex: 2,
+ },
+ {
+ id: "annotation-2",
+ startMs: 130,
+ endMs: 300,
+ type: "text",
+ content: "two",
+ annotationSource: "auto-caption",
+ position: { x: 0, y: 0 },
+ size: { width: 10, height: 10 },
+ style: {
+ color: "#fff",
+ backgroundColor: "transparent",
+ fontSize: 24,
+ fontFamily: "Inter",
+ fontWeight: "normal",
+ fontStyle: "normal",
+ textDecoration: "none",
+ textAlign: "center",
+ },
+ zIndex: 3,
+ },
+ ],
+ 0,
+ );
+
+ // The manual region overlaps both captions in time and must be left exactly as given.
+ expect(regions.find((r) => r.id === "manual-1")).toMatchObject({
+ startMs: 50,
+ endMs: 1000,
+ });
+ expect(regions.find((r) => r.id === "annotation-1")).toMatchObject({
+ startMs: 0,
+ endMs: 120,
+ });
+ expect(regions.find((r) => r.id === "annotation-2")).toMatchObject({
+ startMs: 130,
+ endMs: 300,
+ });
+ });
+});
diff --git a/src/lib/captioning/annotationsFromCaptions.ts b/src/lib/captioning/annotationsFromCaptions.ts
new file mode 100644
index 000000000..0f6dc2af4
--- /dev/null
+++ b/src/lib/captioning/annotationsFromCaptions.ts
@@ -0,0 +1,618 @@
+import type { AnnotationRegion, AnnotationTextStyle } from "@/components/video-editor/types";
+
+import type { CaptionSegment } from "./transcribe";
+
+/** Wide lower-third bar; `position.x` is top-left as % of container, so center with (100 − width) / 2. */
+const CAPTION_WIDTH = 92;
+const CAPTION_HEIGHT = 12;
+const CAPTION_BOTTOM_MARGIN = 2;
+
+/** Top-left corner of the caption box: horizontally centered, sitting `CAPTION_BOTTOM_MARGIN` above the bottom edge. */
+const CAPTION_POSITION = {
+ x: (100 - CAPTION_WIDTH) / 2,
+ y: 100 - CAPTION_HEIGHT - CAPTION_BOTTOM_MARGIN,
+};
+
+const CAPTION_SIZE = { width: CAPTION_WIDTH, height: CAPTION_HEIGHT };
+
+/** Default text style for caption regions: white, centered text on a fully transparent background (alpha 0). */
+const CAPTION_STYLE: AnnotationTextStyle = {
+ color: "#ffffff",
+ backgroundColor: "rgba(255, 255, 255, 0)",
+ fontSize: 24,
+ fontFamily: "Inter",
+ fontWeight: "normal",
+ fontStyle: "normal",
+ textDecoration: "none",
+ textAlign: "center",
+};
+
+/**
+ * Offset **added** to each caption start (seconds); use a negative value to nudge starts
+ * earlier. Whisper onsets are often slightly late vs. what you hear; do **not** apply the same
+ * offset to ends — that pulls lines off-screen too early. Currently 0, i.e. no bias.
+ */
+const AUTO_CAPTION_START_BIAS_SEC = 0;
+
+/**
+ * Extra time added after Whisper's segment **end** (seconds). Model end times are often early vs.
+ * trailing vowels / room tone; this is separate from `AUTO_CAPTION_START_BIAS_SEC`. Currently 0,
+ * i.e. no hold.
+ */
+const AUTO_CAPTION_END_HOLD_SEC = 0;
+
+/** Inside one Whisper phrase, sub-lines can be shorter (do not steal time from neighbors). */
+const WORD_SPLIT_MIN_SPAN_SEC = 0.02;
+
+/**
+ * Brief linger after the last word in a line (seconds); trimmed if it would overlap the next line.
+ * Currently 0, i.e. no linger.
+ */
+const CAPTION_LINE_END_TAIL_SEC = 0;
+
+/** A real silence between word-level timestamps should start a new caption run. */
+const WORD_RUN_BREAK_GAP_SEC = 0.24;
+
+/**
+ * Minimum time between consecutive caption regions on the timeline (seconds). A positive value
+ * keeps a visible gap so blocks do not read as one clip; keep it small so natural short pauses
+ * between phrases are not erased. Currently 0, which disables gap enforcement.
+ */
+const MIN_CAPTION_TIMELINE_GAP_SEC = 0;
+
+/** Same text again with almost no gap or overlap — common Whisper / chunk artifact. */
+const DEDUPE_SAME_TEXT_MAX_GAP_SEC = 0.55;
+
+/**
+ * Maximum time (seconds) after a segment's end within which a later segment with the same
+ * normalized text is still treated as an echo by `collapseSameContentEchoes`.
+ */
+export const SAME_CONTENT_ECHO_MAX_GAP_SEC = 1.15;
+
+/**
+ * Builds the key used to decide whether two caption texts count as "the same": trims, collapses
+ * whitespace runs to one space, straightens curly quotes, lower-cases, and strips trailing
+ * `.!?,;:` punctuation.
+ *
+ * @param text Raw caption text.
+ * @returns The normalized comparison key (may be empty).
+ */
+function normalizeCaptionKey(text: string): string {
+ return text
+ .trim()
+ .replace(/\s+/g, " ")
+ // Curly single quotes → straight apostrophe.
+ .replace(/[\u2018\u2019]/g, "'")
+ // Curly double quotes → straight double quote.
+ .replace(/[\u201C\u201D]/g, '"')
+ .toLowerCase()
+ // Only punctuation at the very end is removed; interior punctuation stays significant.
+ .replace(/[.!?,;:]+$/g, "");
+}
+
+/**
+ * Legacy echo-collapse helper kept for reference while phrase timing uses raw model spans.
+ *
+ * Sorts the non-blank segments by start (then end) time and folds a segment into the most
+ * recently kept segment with the same normalized text when it starts less than
+ * `SAME_CONTENT_ECHO_MAX_GAP_SEC` after that segment's end. The match is by text, not by
+ * adjacency, so segments with different text may sit in between.
+ *
+ * @param segments Caption segments in any order; neither the array nor its items are mutated.
+ * @returns New segment objects with trimmed text, in start-time order.
+ */
+export function collapseSameContentEchoes(segments: CaptionSegment[]): CaptionSegment[] {
+ const sorted = [...segments]
+ .filter((s) => s.text.trim())
+ .sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+ const out: CaptionSegment[] = [];
+ // Normalized text → index in `out` of the latest segment kept for that text. Typed explicitly
+ // so the lookup below yields `number | undefined` instead of `any`.
+ const lastIndexByKey = new Map<string, number>();
+
+ for (const seg of sorted) {
+ const key = normalizeCaptionKey(seg.text);
+ const hit = lastIndexByKey.get(key);
+ if (hit !== undefined) {
+ const prev = out[hit]!;
+ if (seg.startSec < prev.endSec + SAME_CONTENT_ECHO_MAX_GAP_SEC) {
+ // Echo: widen the kept segment to cover both spans and drop the repeat.
+ prev.startSec = Math.min(prev.startSec, seg.startSec);
+ prev.endSec = Math.max(prev.endSec, seg.endSec);
+ continue;
+ }
+ }
+ out.push({
+ startSec: seg.startSec,
+ endSec: seg.endSec,
+ text: seg.text.trim(),
+ });
+ lastIndexByKey.set(key, out.length - 1);
+ }
+ return out;
+}
+
+/**
+ * Collapse adjacent duplicate lines (overlapping or tiny gap). Does not merge the same phrase
+ * repeated later in the video when separated by real silence.
+ *
+ * Rationale: only merge segments that are almost back-to-back (Whisper often splits mid-phrase
+ * with a tiny gap). Wider gaps are usually silence or missed audio — merging those stretches
+ * word timing across dead air.
+ *
+ * A segment is folded into the previously kept one when their normalized text matches and they
+ * overlap by more than 15 ms or the gap between them is under `DEDUPE_SAME_TEXT_MAX_GAP_SEC`.
+ *
+ * @param segments Caption segments in any order; neither the array nor its items are mutated.
+ * @returns New segment objects with trimmed text, in start-time order.
+ */
+function dedupeAdjacentCaptionRepeats(segments: CaptionSegment[]): CaptionSegment[] {
+ const sorted = [...segments]
+ .filter((s) => s.text.trim())
+ .sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+ const out: CaptionSegment[] = [];
+ for (const seg of sorted) {
+ const t = seg.text.trim();
+ // Only the immediately preceding kept segment is considered.
+ const prev = out[out.length - 1];
+ if (prev && normalizeCaptionKey(prev.text) === normalizeCaptionKey(t)) {
+ const overlap = prev.endSec - seg.startSec;
+ const gap = seg.startSec - prev.endSec;
+ if (overlap > 0.015 || gap < DEDUPE_SAME_TEXT_MAX_GAP_SEC) {
+ // Duplicate: extend the kept segment to cover both spans and drop this one.
+ prev.startSec = Math.min(prev.startSec, seg.startSec);
+ prev.endSec = Math.max(prev.endSec, seg.endSec);
+ continue;
+ }
+ }
+ out.push({ startSec: seg.startSec, endSec: seg.endSec, text: t });
+ }
+ return out;
+}
+
+/**
+ * Trim only real overlaps. Avoid synthetic lead/lag so caption timing matches model output.
+ *
+ * Steps: drop blank segments, sort by start (then end) time, apply
+ * `AUTO_CAPTION_START_BIAS_SEC` / `AUTO_CAPTION_END_HOLD_SEC`, clamp starts at 0, guarantee a
+ * positive duration, then pull a segment's end back to the next segment's start wherever the two
+ * overlap by more than `OVERLAP_TRIM_SEC`.
+ *
+ * @param segments Caption segments in any order; neither the array nor its items are mutated.
+ * @returns New segment objects with trimmed text, in start-time order.
+ */
+function finalizeCaptionSegmentsForPlayback(segments: CaptionSegment[]): CaptionSegment[] {
+ // Overlaps of 2 ms or less are tolerated and left alone.
+ const OVERLAP_TRIM_SEC = 0.002;
+
+ const sortedRaw = [...segments]
+ .filter((s) => s.text.trim())
+ .sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+
+ const a = sortedRaw.map((seg) => {
+ let s = seg.startSec + AUTO_CAPTION_START_BIAS_SEC;
+ let e = seg.endSec + AUTO_CAPTION_END_HOLD_SEC;
+ s = Math.max(0, s);
+ // Zero or negative length after adjustment: give the segment a 20 ms span.
+ if (e <= s) e = s + 0.02;
+ return { startSec: s, endSec: e, text: seg.text.trim() };
+ });
+
+ for (let i = 1; i < a.length; i++) {
+ if (a[i].startSec < a[i - 1].endSec - OVERLAP_TRIM_SEC) {
+ // End the previous segment where this one starts, but keep it at least 0.1 ms long.
+ a[i - 1].endSec = Math.max(a[i - 1].startSec + 1e-4, a[i].startSec);
+ }
+ }
+
+ return a;
+}
+
+/**
+ * Default min gap between auto-caption blocks on the timeline (ms); matches
+ * `MIN_CAPTION_TIMELINE_GAP_SEC`, rounded to whole milliseconds. With that constant at 0 this
+ * evaluates to 0.
+ */
+export const DEFAULT_AUTO_CAPTION_MIN_GAP_MS = Math.round(MIN_CAPTION_TIMELINE_GAP_SEC * 1000);
+
+/**
+ * Enforces a minimum gap between consecutive `auto-caption` regions (by start time). Shortens the
+ * previous region's end when possible; otherwise shifts the following region later so edits on
+ * the timeline cannot squeeze caption blocks completely flush.
+ *
+ * Regions without `annotationSource === "auto-caption"` are never modified, and the output keeps
+ * the input order. The input array itself is returned when it is empty, when the rounded gap is
+ * 0, or when it holds at most one auto-caption region.
+ *
+ * @param regions All annotation regions on the timeline; items are not mutated (adjusted regions
+ * are copies).
+ * @param minGapMs Required gap in milliseconds; negative values are treated as 0 and fractions
+ * are rounded. Defaults to `DEFAULT_AUTO_CAPTION_MIN_GAP_MS`.
+ * @returns The regions with auto-caption timings adjusted.
+ */
+export function reconcileAutoCaptionTimelineGaps(
+ regions: AnnotationRegion[],
+ minGapMs: number = DEFAULT_AUTO_CAPTION_MIN_GAP_MS,
+): AnnotationRegion[] {
+ const gap = Math.max(0, Math.round(minGapMs));
+ if (regions.length === 0 || gap === 0) return regions;
+
+ const autoCandidates = regions.filter((r) => r.annotationSource === "auto-caption");
+ if (autoCandidates.length <= 1) return regions;
+
+ const sorted = [...autoCandidates].sort((a, b) => a.startMs - b.startMs || a.endMs - b.endMs);
+ const fixed: AnnotationRegion[] = [];
+ let prev = { ...sorted[0]! };
+ fixed.push(prev);
+
+ for (let i = 1; i < sorted.length; i++) {
+ let cur = { ...sorted[i]! };
+ const minStart = prev.endMs + gap;
+
+ if (cur.startMs < minStart) {
+ const newPrevEnd = cur.startMs - gap;
+ if (newPrevEnd >= prev.startMs + 1) {
+ // Preferred fix: end the previous region early, as long as it keeps at least 1 ms.
+ prev = { ...prev, endMs: newPrevEnd };
+ fixed[fixed.length - 1] = prev;
+ } else {
+ // Previous region is too short to trim: push this one later, keeping its duration.
+ const dur = Math.max(1, cur.endMs - cur.startMs);
+ cur = { ...cur, startMs: minStart, endMs: minStart + dur };
+ }
+ }
+
+ fixed.push(cur);
+ prev = cur;
+ }
+
+ // Write the adjusted copies back in the caller's order.
+ // NOTE(review): lookup is by `id`, so this assumes region ids are unique — confirm.
+ const fixedById = new Map(fixed.map((r) => [r.id, r]));
+ return regions.map((r) => fixedById.get(r.id) ?? r);
+}
+
+/**
+ * Join phrases that are close in time so the editor does not create dozens of separate overlays.
+ *
+ * Segments are sorted by start time and blank ones skipped. A segment is appended to the previous
+ * block (texts joined with one space) only when all three limits hold; otherwise it starts a new
+ * block.
+ *
+ * @param segments Caption segments in any order; neither the array nor its items are mutated.
+ * @param options Merge limits:
+ * - `maxGapSec` — largest allowed gap between the previous end and this start (default 1.35).
+ * - `maxChars` — largest allowed length of the joined text (default 320).
+ * - `maxBlockDurationSec` — largest allowed span from block start to merged end (default 12).
+ * @returns New segment objects with trimmed text, in start-time order.
+ */
+export function mergeAdjacentCaptionSegments(
+ segments: CaptionSegment[],
+ options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+): CaptionSegment[] {
+ const maxGapSec = options?.maxGapSec ?? 1.35;
+ const maxChars = options?.maxChars ?? 320;
+ const maxBlockDurationSec = options?.maxBlockDurationSec ?? 12;
+
+ const sorted = [...segments].sort((a, b) => a.startSec - b.startSec);
+ const out: CaptionSegment[] = [];
+
+ for (const seg of sorted) {
+ const text = seg.text.trim();
+ if (!text) continue;
+
+ const prev = out[out.length - 1];
+ if (!prev) {
+ out.push({ startSec: seg.startSec, endSec: seg.endSec, text });
+ continue;
+ }
+
+ // `gap` is negative when the segments overlap, which always satisfies the gap limit.
+ const gap = seg.startSec - prev.endSec;
+ const mergedText = `${prev.text} ${text}`.trim();
+ const mergedEnd = Math.max(prev.endSec, seg.endSec);
+ const wouldSpan = mergedEnd - prev.startSec;
+ if (gap <= maxGapSec && mergedText.length <= maxChars && wouldSpan <= maxBlockDurationSec) {
+ prev.endSec = mergedEnd;
+ prev.text = mergedText;
+ } else {
+ out.push({ startSec: seg.startSec, endSec: seg.endSec, text });
+ }
+ }
+
+ return out;
+}
+
+ /**
+  * Partitions phrase segments (ordered by start, then end) into groups of neighbours that may be
+  * laid out together.
+  *
+  * A segment joins the current group only when the gap to the previous segment, the length of the
+  * group's text joined with single spaces, and the span from the group's start all stay within the
+  * limits. Defaults: `maxGapSec` 0, `maxChars` and `maxBlockDurationSec` unlimited.
+  *
+  * @returns Groups of trimmed copies of the input segments; blank segments are dropped.
+  */
+ function partitionPhraseCaptionSegments(
+   segments: CaptionSegment[],
+   options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+ ): CaptionSegment[][] {
+   const maxGapSec = options?.maxGapSec ?? 0;
+   const maxChars = options?.maxChars ?? Number.POSITIVE_INFINITY;
+   const maxBlockDurationSec = options?.maxBlockDurationSec ?? Number.POSITIVE_INFINITY;
+
+   const sorted = segments
+     .map((s) => ({ ...s, text: s.text.trim() }))
+     .filter((s) => s.text)
+     .sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+
+   const groups: CaptionSegment[][] = [];
+   let current: CaptionSegment[] = [];
+   // Length of the current group's text once joined with single spaces. Kept as a running total
+   // so the limit matches the text that is actually rendered and no per-segment re-sum is needed.
+   let currentChars = 0;
+
+   for (const seg of sorted) {
+     if (current.length === 0) {
+       current.push(seg);
+       currentChars = seg.text.length;
+       continue;
+     }
+
+     const prev = current[current.length - 1]!;
+     const groupStart = current[0]!.startSec;
+     const gap = seg.startSec - prev.endSec;
+     const wouldChars = currentChars + 1 + seg.text.length;
+     const wouldSpan = Math.max(prev.endSec, seg.endSec) - groupStart;
+
+     if (gap <= maxGapSec && wouldChars <= maxChars && wouldSpan <= maxBlockDurationSec) {
+       current.push(seg);
+       currentChars = wouldChars;
+       continue;
+     }
+
+     groups.push(current);
+     current = [seg];
+     currentChars = seg.text.length;
+   }
+
+   if (current.length > 0) {
+     groups.push(current);
+   }
+
+   return groups;
+ }
+
+ /** Layout knobs accepted by `captionSegmentsToAnnotationRegions`; every field is optional. */
+ export interface CaptionSegmentLayoutOptions {
+ /** Lower bound on words per on-screen caption (default 2). */
+ minWordsPerCaption?: number;
+ /** Upper bound on words per on-screen caption (default 7). */
+ maxWordsPerCaption?: number;
+ /**
+ * How the incoming segment times should be interpreted:
+ * `word`: each `CaptionSegment` is a single token with Whisper word timestamps (default).
+ * `phrase`: merged phrase spans; use proportional line splitting inside each span.
+ */
+ timestampGranularity?: "word" | "phrase";
+ }
+
+ /**
+  * Splits `wordCount` consecutive words into half-open index ranges `[from, to)`, one per caption line.
+  *
+  * Lines take up to `maxWords` words. When a full line would leave a remainder shorter than
+  * `minWords`, the current line is shortened so the remainder reaches `minWords`; if that would
+  * make the current line too short as well, all remaining words go onto one (over-long) line.
+  * The bounds are floored and normalised so that 1 <= min <= max.
+  */
+ function computeCaptionLineIndexRanges(
+   wordCount: number,
+   minWords: number,
+   maxWords: number,
+ ): Array<{ from: number; to: number }> {
+   const flooredMax = Math.floor(maxWords);
+   const lower = Math.max(1, Math.min(Math.floor(minWords), flooredMax));
+   const upper = Math.max(lower, flooredMax);
+   const ranges: Array<{ from: number; to: number }> = [];
+
+   for (let cursor = 0; cursor < wordCount; ) {
+     const left = wordCount - cursor;
+
+     if (left <= upper) {
+       // Everything that is left fits on one line (or is glued to the previous one when tiny).
+       const last = ranges[ranges.length - 1];
+       if (last !== undefined && left < lower) {
+         last.to = wordCount;
+       } else {
+         ranges.push({ from: cursor, to: wordCount });
+       }
+       break;
+     }
+
+     let size = upper;
+     const leftover = left - size;
+     if (leftover > 0 && leftover < lower) {
+       // Give up words on this line so the following line is not shorter than the minimum.
+       size = left - lower;
+       if (size < lower) {
+         ranges.push({ from: cursor, to: wordCount });
+         break;
+       }
+       size = Math.min(size, upper);
+     }
+
+     ranges.push({ from: cursor, to: cursor + size });
+     cursor += size;
+   }
+
+   return ranges;
+ }
+
+/**
+ * Groups per-word segments into on-screen lines using each token's Whisper timestamps
+ * (no proportional stretching across a long phrase span).
+ */
+export function groupTimedCaptionWordsIntoLines(
+ segments: CaptionSegment[],
+ minWords: number,
+ maxWords: number,
+): CaptionSegment[] {
+ const words = [...segments]
+ .filter((s) => s.text.trim())
+ .sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+ if (words.length === 0) return [];
+
+ const minW = Math.max(1, Math.min(Math.floor(minWords), Math.floor(maxWords)));
+ const maxW = Math.max(minW, Math.floor(maxWords));
+ const out: CaptionSegment[] = [];
+
+ let runStart = 0;
+ const flushRun = (runEndExclusive: number) => {
+ const run = words.slice(runStart, runEndExclusive);
+ if (run.length === 0) return;
+ const ranges = computeCaptionLineIndexRanges(run.length, minW, maxW);
+ for (const { from, to } of ranges) {
+ const slice = run.slice(from, to);
+ const s = slice[0]!.startSec;
+ const rawEnd = slice[slice.length - 1]!.endSec;
+ const e = Math.max(s + WORD_SPLIT_MIN_SPAN_SEC, rawEnd + CAPTION_LINE_END_TAIL_SEC);
+ out.push({
+ startSec: s,
+ endSec: e,
+ text: slice.map((w) => w.text.trim()).join(" "),
+ });
+ }
+ };
+
+ for (let i = 1; i < words.length; i++) {
+ const prev = words[i - 1]!;
+ const cur = words[i]!;
+ const gap = cur.startSec - prev.endSec;
+ if (gap >= WORD_RUN_BREAK_GAP_SEC) {
+ flushRun(i);
+ runStart = i;
+ }
+ }
+ flushRun(words.length);
+
+ for (let i = 0; i < out.length - 1; i++) {
+ if (out[i]!.endSec > out[i + 1]!.startSec + 1e-3) {
+ out[i]!.endSec = Math.max(
+ out[i]!.startSec + WORD_SPLIT_MIN_SPAN_SEC,
+ out[i + 1]!.startSec - 1e-4,
+ );
+ }
+ }
+ return out;
+}
+
+/**
+ * Splits each merged transcription span into shorter captions with about
+ * `minWords`–`maxWords` words. Times are interpolated by character weight inside the span.
+ */
+export function splitMergedCaptionsByWordBounds(
+ merged: CaptionSegment[],
+ minWords: number,
+ maxWords: number,
+): CaptionSegment[] {
+ const minW = Math.max(1, Math.min(Math.floor(minWords), Math.floor(maxWords)));
+ const maxW = Math.max(minW, Math.floor(maxWords));
+ const out: CaptionSegment[] = [];
+
+ for (const seg of merged) {
+ const words = seg.text.trim().split(/\s+/).filter(Boolean);
+ if (words.length === 0) continue;
+
+ if (words.length <= maxW) {
+ out.push({
+ startSec: seg.startSec,
+ endSec: seg.endSec,
+ text: words.join(" "),
+ });
+ continue;
+ }
+
+ out.push(...splitOneSegmentByWordBounds(seg.startSec, seg.endSec, words, minW, maxW));
+ }
+
+ return out;
+}
+
+ /**
+  * Re-flows `text` into newline-separated lines whose word counts follow
+  * `computeCaptionLineIndexRanges`. Whitespace runs collapse to single spaces; blank input
+  * yields an empty string.
+  */
+ function wrapCaptionTextByWordBounds(text: string, minWords: number, maxWords: number): string {
+   const tokens = text.trim().split(/\s+/).filter(Boolean);
+   if (tokens.length === 0) return "";
+
+   const flooredMax = Math.floor(maxWords);
+   const lower = Math.max(1, Math.min(Math.floor(minWords), flooredMax));
+   const upper = Math.max(lower, flooredMax);
+
+   const lines: string[] = [];
+   for (const range of computeCaptionLineIndexRanges(tokens.length, lower, upper)) {
+     lines.push(tokens.slice(range.from, range.to).join(" "));
+   }
+   return lines.join("\n");
+ }
+
+ /**
+  * Expands one phrase segment into per-word segments. A single word keeps the phrase's own
+  * times; several words get times from `splitOneSegmentByWordBounds` with one word per slice.
+  * A blank phrase yields no segments.
+  */
+ function expandPhraseSegmentToPseudoWords(segment: CaptionSegment): CaptionSegment[] {
+   const tokens = segment.text.trim().split(/\s+/).filter(Boolean);
+
+   switch (tokens.length) {
+     case 0:
+       return [];
+     case 1:
+       return [{ startSec: segment.startSec, endSec: segment.endSec, text: tokens[0]! }];
+     default:
+       return splitOneSegmentByWordBounds(segment.startSec, segment.endSec, tokens, 1, 1);
+   }
+ }
+
+ /**
+ * Turns phrase-level segments into on-screen caption lines.
+ *
+ * Segments are first partitioned by `partitionPhraseCaptionSegments` (using `options`).
+ * - A group holding a single segment is wrapped into lines of `minWords`–`maxWords` words and
+ *   the segment's time span is divided evenly between those lines.
+ * - A group holding several segments is expanded into per-word pseudo segments and laid out by
+ *   `groupTimedCaptionWordsIntoLines`.
+ */
+ export function groupPhraseCaptionSegmentsIntoLines(
+ segments: CaptionSegment[],
+ minWords: number,
+ maxWords: number,
+ options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+ ): CaptionSegment[] {
+ const groups = partitionPhraseCaptionSegments(segments, options);
+ const out: CaptionSegment[] = [];
+
+ for (const group of groups) {
+ if (group.length === 1) {
+ const only = group[0]!;
+ const wrapped = wrapCaptionTextByWordBounds(only.text, minWords, maxWords).trim();
+ if (!wrapped) continue;
+ const lineTexts = wrapped
+ .split("\n")
+ .map((t) => t.trim())
+ .filter(Boolean);
+ const n = lineTexts.length;
+ const rawDur = only.endSec - only.startSec;
+ // Span too short to give every line WORD_SPLIT_MIN_SPAN_SEC: keep a single caption with
+ // the wrapped lines joined back together by spaces.
+ if (n > 1 && rawDur < n * WORD_SPLIT_MIN_SPAN_SEC) {
+ out.push({
+ startSec: only.startSec,
+ endSec: only.endSec,
+ text: lineTexts.join(" "),
+ });
+ continue;
+ }
+ // Past the guard above, a finite rawDur with n > 1 is already >= n * WORD_SPLIT_MIN_SPAN_SEC.
+ const dur = Math.max(rawDur, WORD_SPLIT_MIN_SPAN_SEC * n);
+ // One line (or none): emit the segment with its original times.
+ if (n <= 1) {
+ out.push({
+ startSec: only.startSec,
+ endSec: only.endSec,
+ text: lineTexts[0] ?? wrapped,
+ });
+ continue;
+ }
+ // Several lines: equal time slices; the last line ends exactly at the segment's end.
+ for (let i = 0; i < n; i++) {
+ const startSec = only.startSec + (dur * i) / n;
+ const boundary = only.startSec + (dur * (i + 1)) / n;
+ const endSec =
+ i === n - 1 ? only.endSec : Math.max(startSec + WORD_SPLIT_MIN_SPAN_SEC, boundary);
+ out.push({
+ startSec,
+ endSec,
+ text: lineTexts[i]!,
+ });
+ }
+ continue;
+ }
+
+ // Several phrases in one group: approximate word timings inside each phrase, then reuse the
+ // word-based line layout.
+ const pseudoWords = group.flatMap(expandPhraseSegmentToPseudoWords);
+ out.push(...groupTimedCaptionWordsIntoLines(pseudoWords, minWords, maxWords));
+ }
+
+ return out;
+ }
+
+ /**
+ * Splits one span `[startSec, endSec]` into caption lines whose word ranges come from
+ * `computeCaptionLineIndexRanges`, timing each line in proportion to the character count of
+ * the words it holds.
+ *
+ * @param words    Tokens of the span, in order.
+ * @param minWords Lower bound on words per line (passed to the range helper).
+ * @param maxWords Upper bound on words per line (passed to the range helper).
+ * @returns One segment per line, in order; the last one always ends at `endSec`.
+ */
+ function splitOneSegmentByWordBounds(
+ startSec: number,
+ endSec: number,
+ words: string[],
+ minWords: number,
+ maxWords: number,
+ ): CaptionSegment[] {
+ const sliceRanges = computeCaptionLineIndexRanges(words.length, minWords, maxWords);
+
+ // Interpolate over at least 50 ms so a zero-length or inverted span still yields distinct times.
+ const dur = Math.max(endSec - startSec, 0.05);
+ // Each word weighs its character count (at least 1).
+ const weights = words.map((w) => Math.max(1, w.length));
+ const totalW = weights.reduce((a, b) => a + b, 0);
+
+ // Total weight of words[from, to).
+ const weightSum = (from: number, to: number) => {
+ let s = 0;
+ for (let k = from; k < to; k++) s += weights[k] ?? 0;
+ return s;
+ };
+
+ const result: CaptionSegment[] = [];
+ let prevEnd = startSec;
+ for (const { from, to } of sliceRanges) {
+ const wb = weightSum(0, from);
+ const ws = weightSum(from, to);
+ let s = startSec + (wb / totalW) * dur;
+ let e = startSec + ((wb + ws) / totalW) * dur;
+ // Never start before the previous line ended.
+ s = Math.max(s, prevEnd);
+ // Keep the line at least WORD_SPLIT_MIN_SPAN_SEC long, but never past the span's end.
+ e = Math.max(s + WORD_SPLIT_MIN_SPAN_SEC, e);
+ e = Math.min(e, endSec);
+ if (e <= s) {
+ e = Math.min(endSec, s + WORD_SPLIT_MIN_SPAN_SEC);
+ }
+ prevEnd = e;
+ result.push({
+ startSec: s,
+ endSec: e,
+ text: words.slice(from, to).join(" "),
+ });
+ }
+ if (result.length > 0) {
+ // The last line always runs to the end of the span.
+ result[result.length - 1].endSec = endSec;
+ // Trim any line that runs more than 2 ms into its successor.
+ for (let i = 0; i < result.length - 1; i++) {
+ if (result[i].endSec > result[i + 1].startSec + 0.002) {
+ result[i].endSec = Math.max(result[i].startSec + 1e-4, result[i + 1].startSec);
+ }
+ }
+ }
+ return result;
+ }
+
+ /**
+  * Converts transcription segments into `auto-caption` text annotation regions.
+  *
+  * Pipeline: group segments into lines (word or phrase timing, per `layout`), drop adjacent
+  * repeats, finalise for playback, convert to millisecond regions (at least 1 ms long) and
+  * enforce the default minimum gap between them.
+  *
+  * @param startNumericId Numeric suffix used for the first generated `annotation-<n>` id.
+  * @param startZIndex    z-index given to the first generated region; later ones count up.
+  * @returns The regions plus the next unused numeric id and z-index.
+  */
+ export function captionSegmentsToAnnotationRegions(
+   segments: CaptionSegment[],
+   startNumericId: number,
+   startZIndex: number,
+   layout?: CaptionSegmentLayoutOptions,
+ ): { regions: AnnotationRegion[]; nextNumericId: number; nextZIndex: number } {
+   // Do not echo-collapse raw word tokens before grouping: repeated words ("I … I") share a
+   // normalized key and would merge spans while keeping only the first token's text.
+   const minWordsPerLine = layout?.minWordsPerCaption ?? 2;
+   const maxWordsPerLine = layout?.maxWordsPerCaption ?? 7;
+   const usePhraseTiming = (layout?.timestampGranularity ?? "word") === "phrase";
+
+   const lines = usePhraseTiming
+     ? groupPhraseCaptionSegmentsIntoLines(segments, minWordsPerLine, maxWordsPerLine)
+     : groupTimedCaptionWordsIntoLines(segments, minWordsPerLine, maxWordsPerLine);
+   const playable = finalizeCaptionSegmentsForPlayback(dedupeAdjacentCaptionRepeats(lines));
+
+   const regions = playable.map((line, offset): AnnotationRegion => {
+     const startMs = Math.round(line.startSec * 1000);
+     return {
+       id: `annotation-${startNumericId + offset}`,
+       startMs,
+       endMs: Math.max(Math.round(line.endSec * 1000), startMs + 1),
+       type: "text",
+       content: line.text,
+       annotationSource: "auto-caption",
+       position: { ...CAPTION_POSITION },
+       size: { ...CAPTION_SIZE },
+       style: { ...CAPTION_STYLE },
+       zIndex: startZIndex + offset,
+     };
+   });
+
+   return {
+     regions: reconcileAutoCaptionTimelineGaps(regions),
+     nextNumericId: startNumericId + regions.length,
+     nextZIndex: startZIndex + regions.length,
+   };
+ }
+
+ /**
+  * Returns the largest `<n>` among region ids of the exact form `annotation-<n>`, or 0 when no
+  * id matches.
+  */
+ export function maxAnnotationNumericId(regions: AnnotationRegion[]): number {
+   const idPattern = /^annotation-(\d+)$/;
+   return regions.reduce((highest, region) => {
+     const match = idPattern.exec(region.id);
+     return match ? Math.max(highest, Number.parseInt(match[1], 10)) : highest;
+   }, 0);
+ }
+
+ /**
+  * Returns the highest `zIndex` among `regions`, or 0 for an empty list.
+  *
+  * Uses a plain loop rather than `Math.max(...array)`: spreading passes every element as a call
+  * argument, which throws a `RangeError` once the list exceeds the engine's argument limit.
+  */
+ export function maxAnnotationZIndex(regions: AnnotationRegion[]): number {
+   if (regions.length === 0) return 0;
+   let highest = Number.NEGATIVE_INFINITY;
+   for (const region of regions) {
+     highest = Math.max(highest, region.zIndex);
+   }
+   return highest;
+ }
diff --git a/src/lib/captioning/captionConstants.ts b/src/lib/captioning/captionConstants.ts
new file mode 100644
index 000000000..1bacb7cc7
--- /dev/null
+++ b/src/lib/captioning/captionConstants.ts
@@ -0,0 +1,2 @@
+/** Max audio length for auto-captions (decode + transcribe); keep demuxer read aligned with this. */
+export const MAX_CAPTION_AUDIO_SEC = 4 * 60 * 60;
diff --git a/src/lib/captioning/extractMono16k.ts b/src/lib/captioning/extractMono16k.ts
new file mode 100644
index 000000000..53258567c
--- /dev/null
+++ b/src/lib/captioning/extractMono16k.ts
@@ -0,0 +1,159 @@
+import { MAX_CAPTION_AUDIO_SEC } from "./captionConstants";
+import { extractMonoPcmViaWebDemuxer } from "./extractMono16kWebDemuxer";
+
+export { MAX_CAPTION_AUDIO_SEC };
+
+const FETCH_TIMEOUT_MS = 120_000;
+
+ /**
+  * `fetch` that is aborted after `FETCH_TIMEOUT_MS` or when the caller's `signal` fires.
+  *
+  * The timer and the abort listener are removed as soon as `fetch` settles, so they guard the
+  * request up to the response being returned; reading the body afterwards is not covered.
+  *
+  * @param url    Resource to fetch.
+  * @param signal Optional caller-side abort signal.
+  * @returns The `Response`; rejects like `fetch` does, including on abort or timeout.
+  */
+ async function fetchWithTimeout(url: string, signal?: AbortSignal): Promise<Response> {
+   const ctrl = new AbortController();
+   const timer = window.setTimeout(() => ctrl.abort(), FETCH_TIMEOUT_MS);
+   const onAbort = () => ctrl.abort();
+   if (signal) {
+     if (signal.aborted) ctrl.abort();
+     else signal.addEventListener("abort", onAbort, { once: true });
+   }
+   try {
+     return await fetch(url, { signal: ctrl.signal });
+   } finally {
+     window.clearTimeout(timer);
+     if (signal) signal.removeEventListener("abort", onAbort);
+   }
+ }
+
+/**
+ * Load the editor video the same way as `StreamingVideoDecoder`:
+ * Electron `readBinaryFile` for local paths (fetch(file://) is unreliable in the renderer),
+ * otherwise HTTP / blob / data URLs via fetch.
+ */
+async function loadSourceVideoFile(videoUrl: string, signal?: AbortSignal): Promise {
+ const isRemoteUrl = /^(https?:|blob:|data:)/i.test(videoUrl);
+
+ if (!isRemoteUrl && window.electronAPI?.readBinaryFile) {
+ const result = await window.electronAPI.readBinaryFile(videoUrl);
+ if (!result.success || !result.data) {
+ throw new Error(result.message || result.error || "Failed to read source video");
+ }
+ const filename = (result.path || videoUrl).split(/[\\/]/).pop() || "video";
+ return new File([result.data], filename, { type: "video/webm" });
+ }
+
+ const response = await fetchWithTimeout(videoUrl, signal);
+ if (!response.ok) {
+ throw new Error(`Failed to load video for captions: ${response.status} ${response.statusText}`);
+ }
+ const blob = await response.blob();
+ if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+ const filename = videoUrl.split("/").pop() || "video";
+ return new File([blob], filename, { type: blob.type || "video/webm" });
+}
+
+ /**
+  * Down-mixes an `AudioBuffer` to mono by averaging all channels sample by sample.
+  *
+  * Channel data is fetched once per channel rather than once per sample, which matters for
+  * long recordings.
+  *
+  * @returns A new `Float32Array` of `audioBuffer.length` samples (all zeros when the buffer
+  *          has no channels).
+  */
+ function mixToMono(audioBuffer: AudioBuffer): Float32Array {
+   const { length, numberOfChannels } = audioBuffer;
+   const out = new Float32Array(length);
+   if (numberOfChannels === 0) return out;
+
+   const channels: Float32Array[] = [];
+   for (let c = 0; c < numberOfChannels; c++) {
+     channels.push(audioBuffer.getChannelData(c));
+   }
+
+   for (let i = 0; i < length; i++) {
+     let sum = 0;
+     for (const channel of channels) {
+       sum += channel[i];
+     }
+     out[i] = sum / numberOfChannels;
+   }
+   return out;
+ }
+
+ /**
+  * Resamples mono PCM from `fromRate` to `toRate` by rendering it through an
+  * `OfflineAudioContext`.
+  *
+  * @param mono     Source samples.
+  * @param fromRate Sample rate of `mono` in Hz.
+  * @param toRate   Target sample rate in Hz.
+  * @param signal   Optional abort signal, checked before and after rendering.
+  * @returns `mono` itself when the rates already match, otherwise a new array at `toRate`.
+  * @throws `DOMException` named `AbortError` when `signal` has fired.
+  */
+ async function resampleMono(
+   mono: Float32Array,
+   fromRate: number,
+   toRate: number,
+   signal?: AbortSignal,
+ ): Promise<Float32Array> {
+   if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+   if (fromRate === toRate) return mono;
+   const durationSec = mono.length / fromRate;
+   const outLength = Math.max(1, Math.ceil(durationSec * toRate));
+   const offline = new OfflineAudioContext(1, outLength, toRate);
+   // The source buffer keeps its original rate; the offline context converts while rendering.
+   const buf = offline.createBuffer(1, mono.length, fromRate);
+   buf.copyToChannel(Float32Array.from(mono), 0);
+   const src = offline.createBufferSource();
+   src.buffer = buf;
+   src.connect(offline.destination);
+   src.start(0);
+   const rendered = await offline.startRendering();
+   if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+   return rendered.getChannelData(0).slice();
+ }
+
+ /**
+  * Caps the audio at `MAX_CAPTION_AUDIO_SEC` (when `durationSec` exceeds it) and resamples the
+  * result to 16 kHz.
+  *
+  * @returns The 16 kHz samples, whether the input was cut, and the duration of the returned
+  *          samples in seconds.
+  */
+ async function truncateAndResampleTo16k(
+   mono: Float32Array,
+   fromRate: number,
+   durationSec: number,
+   signal?: AbortSignal,
+ ): Promise<{ samples: Float32Array; truncated: boolean; durationSec: number }> {
+   const overLimit = durationSec > MAX_CAPTION_AUDIO_SEC;
+   const capped = overLimit
+     ? mono.subarray(0, Math.min(mono.length, Math.floor(MAX_CAPTION_AUDIO_SEC * fromRate)))
+     : mono;
+
+   const samples = await resampleMono(capped, fromRate, 16_000, signal);
+   return { samples, truncated: overLimit, durationSec: samples.length / 16_000 };
+ }
+
+/**
+ * Decode the video's audio track to mono 16 kHz float samples (Whisper input).
+ * Prefers `decodeAudioData` when the container is supported; otherwise uses the same
+ * web-demuxer + AudioDecoder path as export.
+ */
+export async function extractMono16kFromVideoUrl(
+ videoUrl: string,
+ options?: { signal?: AbortSignal },
+): Promise<{ samples: Float32Array; truncated: boolean; durationSec: number }> {
+ const file = await loadSourceVideoFile(videoUrl, options?.signal);
+
+ /** When this returns null, use web-demuxer + AudioDecoder (same as export). */
+ const tryDecodeAudioDataPath = async (): Promise<{
+ samples: Float32Array;
+ truncated: boolean;
+ durationSec: number;
+ } | null> => {
+ const audioContext = new AudioContext();
+ try {
+ const ab = await file.arrayBuffer();
+ if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+ const audioBuffer = await audioContext.decodeAudioData(ab.slice(0));
+ if (
+ audioBuffer.numberOfChannels === 0 ||
+ audioBuffer.length === 0 ||
+ !Number.isFinite(audioBuffer.duration) ||
+ audioBuffer.duration <= 0
+ ) {
+ return null;
+ }
+ const durationSec = audioBuffer.duration;
+ const mono = mixToMono(audioBuffer);
+ const fromRate = audioBuffer.sampleRate;
+ const out = await truncateAndResampleTo16k(mono, fromRate, durationSec, options?.signal);
+ // decodeAudioData can resolve for some WebM/Matroska inputs yet yield almost no usable
+ // PCM; captions only run the demuxer path on throw today, so we never recover.
+ if (out.samples.length < 800) {
+ return null;
+ }
+ return out;
+ } catch {
+ return null;
+ } finally {
+ await audioContext.close().catch(() => undefined);
+ }
+ };
+
+ const primary = await tryDecodeAudioDataPath();
+ if (primary) {
+ return primary;
+ }
+
+ const pcm = await extractMonoPcmViaWebDemuxer(file, options?.signal);
+ return truncateAndResampleTo16k(pcm.mono, pcm.sampleRate, pcm.durationSec, options?.signal);
+}
diff --git a/src/lib/captioning/extractMono16kWebDemuxer.ts b/src/lib/captioning/extractMono16kWebDemuxer.ts
new file mode 100644
index 000000000..fd85f5703
--- /dev/null
+++ b/src/lib/captioning/extractMono16kWebDemuxer.ts
@@ -0,0 +1,187 @@
+import { WebDemuxer } from "web-demuxer";
+
+import { MAX_CAPTION_AUDIO_SEC } from "./captionConstants";
+
+const DECODE_QUEUE_BACKPRESSURE = 20;
+const SOURCE_LOAD_TIMEOUT_MS = 60_000;
+const READ_END_PADDING_SEC = 0.5;
+
+ /** Absolute URL of the web-demuxer WASM binary, resolved relative to the current page URL. */
+ function webDemuxerWasmUrl(): string {
+   const pageUrl = window.location.href;
+   const wasmUrl = new URL("../exporter/wasm/web-demuxer.wasm", pageUrl);
+   return wasmUrl.href;
+ }
+
+ /**
+  * Down-mixes one decoded `AudioData` frame to mono float samples by averaging its channels.
+  *
+  * Every channel is read through `copyTo` with `format: "f32-planar"`, so the data is converted
+  * to 32-bit float planes whatever sample format the decoder produced. Copying without a
+  * target format would write raw integer bytes (s16, s32, u8) into the float buffer.
+  *
+  * @returns A new `Float32Array` with `frame.numberOfFrames` samples.
+  */
+ function audioDataFrameToMono(frame: AudioData): Float32Array {
+   const frameCount = frame.numberOfFrames;
+   const channelCount = frame.numberOfChannels;
+   const out = new Float32Array(frameCount);
+   if (frameCount === 0 || channelCount === 0) return out;
+
+   const plane = new Float32Array(frameCount);
+   for (let c = 0; c < channelCount; c++) {
+     frame.copyTo(plane, { planeIndex: c, format: "f32-planar" });
+     for (let i = 0; i < frameCount; i++) {
+       out[i] += plane[i];
+     }
+   }
+   for (let i = 0; i < frameCount; i++) {
+     out[i] /= channelCount;
+   }
+   return out;
+ }
+
+ /**
+  * Places decoded frames on one mono timeline of `ceil(durationSec * sampleRate)` samples
+  * (at least one) and closes every frame once it has been read.
+  *
+  * Each frame is positioned by its `timestamp` (microseconds). Samples that fall outside the
+  * timeline are dropped; positions written by more than one frame are averaged.
+  */
+ function mergeAndConsumeDecodedAudioToMonoLinear(
+   frames: AudioData[],
+   sampleRate: number,
+   durationSec: number,
+ ): Float32Array {
+   const chronological = frames.slice().sort((left, right) => left.timestamp - right.timestamp);
+   const sampleCount = Math.max(1, Math.ceil(durationSec * sampleRate));
+   const mixed = new Float32Array(sampleCount);
+   const hits = new Float32Array(sampleCount);
+
+   for (const frame of chronological) {
+     const offset = Math.round((frame.timestamp / 1e6) * sampleRate);
+     const pcm = audioDataFrameToMono(frame);
+     // Only the part of the frame that lands inside [0, sampleCount) is written.
+     const from = Math.max(0, -offset);
+     const to = Math.min(pcm.length, sampleCount - offset);
+     for (let i = from; i < to; i++) {
+       mixed[offset + i] += pcm[i];
+       hits[offset + i] += 1;
+     }
+     frame.close();
+   }
+
+   for (let pos = 0; pos < sampleCount; pos++) {
+     if (hits[pos] > 0) {
+       mixed[pos] /= hits[pos];
+     }
+   }
+   return mixed;
+ }
+
+ /**
+  * Races `promise` against a timer.
+  *
+  * @param promise Work to wait for.
+  * @param ms      Timeout in milliseconds.
+  * @param message Message of the `Error` used when the timer wins.
+  * @returns A promise that settles like `promise`, or rejects with `Error(message)` after `ms`.
+  *          Non-`Error` rejection reasons are wrapped in an `Error`.
+  */
+ function withTimeout<T>(promise: Promise<T>, ms: number, message: string): Promise<T> {
+   return new Promise<T>((resolve, reject) => {
+     const id = window.setTimeout(() => reject(new Error(message)), ms);
+     promise.then(
+       (value) => {
+         window.clearTimeout(id);
+         resolve(value);
+       },
+       (reason: unknown) => {
+         window.clearTimeout(id);
+         reject(reason instanceof Error ? reason : new Error(String(reason)));
+       },
+     );
+   });
+ }
+
+/**
+ * Demux + WebCodecs audio decode (same stack as export). Use when
+ * `decodeAudioData` cannot handle the container (e.g. WebM with video).
+ */
+export async function extractMonoPcmViaWebDemuxer(
+ file: File,
+ signal?: AbortSignal,
+): Promise<{ mono: Float32Array; sampleRate: number; durationSec: number }> {
+ const demuxer = new WebDemuxer({ wasmFilePath: webDemuxerWasmUrl() });
+ await withTimeout(
+ demuxer.load(file),
+ SOURCE_LOAD_TIMEOUT_MS,
+ "Timed out while parsing the source video for captions.",
+ );
+
+ if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+
+ const mediaInfo = await withTimeout(
+ demuxer.getMediaInfo(),
+ SOURCE_LOAD_TIMEOUT_MS,
+ "Timed out while reading media info for captions.",
+ );
+
+ const reportedDurationSec =
+ Number.isFinite(mediaInfo.duration) && mediaInfo.duration > 0 ? mediaInfo.duration : 0;
+
+ let audioConfig: AudioDecoderConfig;
+ try {
+ audioConfig = await demuxer.getDecoderConfig("audio");
+ } catch {
+ throw new Error("No audio track found in this video.");
+ }
+
+ const codecCheck = await AudioDecoder.isConfigSupported(audioConfig);
+ if (!codecCheck.supported) {
+ throw new Error(`Audio codec not supported for captions: ${audioConfig.codec}`);
+ }
+
+ const sampleRate = audioConfig.sampleRate || 48_000;
+
+ // Many WebM/Matroska files report a too-short duration; capping read at reported time stops
+ // demux early and mergeAndConsumeDecodedAudioToMonoLinear clips everything past that. Read up to the
+ // same ceiling as caption decode (demuxer stops when the track ends).
+ const readEndSec = MAX_CAPTION_AUDIO_SEC + READ_END_PADDING_SEC;
+ const decodedFrames: AudioData[] = [];
+
+ const decoder = new AudioDecoder({
+ output: (data: AudioData) => decodedFrames.push(data),
+ error: (e: DOMException) => console.error("[captioning] AudioDecoder error:", e),
+ });
+ decoder.configure(audioConfig);
+
+ const reader = demuxer.read("audio", 0, readEndSec).getReader();
+ try {
+ while (!signal?.aborted) {
+ const { done, value: chunk } = await reader.read();
+ if (done || !chunk) break;
+ decoder.decode(chunk);
+ while (decoder.decodeQueueSize > DECODE_QUEUE_BACKPRESSURE && !signal?.aborted) {
+ await new Promise((r) => setTimeout(r, 1));
+ }
+ }
+ } finally {
+ try {
+ await reader.cancel();
+ } catch {
+ /* already closed */
+ }
+ }
+
+ if (decoder.state === "configured") {
+ await decoder.flush();
+ decoder.close();
+ }
+
+ if (signal?.aborted) {
+ for (const f of decodedFrames) f.close();
+ throw new DOMException("Aborted", "AbortError");
+ }
+
+ if (decodedFrames.length === 0) {
+ throw new Error("Decoded zero audio frames from this video.");
+ }
+
+ let maxEndUs = 0;
+ for (const f of decodedFrames) {
+ const end = f.timestamp + (f.duration ?? 0);
+ if (end > maxEndUs) maxEndUs = end;
+ }
+ const inferredDurationSec = maxEndUs / 1e6;
+ // Prefer extent implied by decoded frames (fixes bad container duration). If frames lack
+ // duration, fall back to reported metadata.
+ const durationSec = inferredDurationSec > 0.02 ? inferredDurationSec : reportedDurationSec;
+
+ const mono = mergeAndConsumeDecodedAudioToMonoLinear(decodedFrames, sampleRate, durationSec);
+ return { mono, sampleRate, durationSec };
+}
diff --git a/src/lib/captioning/index.ts b/src/lib/captioning/index.ts
new file mode 100644
index 000000000..cc2e2a3a6
--- /dev/null
+++ b/src/lib/captioning/index.ts
@@ -0,0 +1,17 @@
+export type { CaptionSegmentLayoutOptions } from "./annotationsFromCaptions";
+export {
+ captionSegmentsToAnnotationRegions,
+ DEFAULT_AUTO_CAPTION_MIN_GAP_MS,
+ groupTimedCaptionWordsIntoLines,
+ mergeAdjacentCaptionSegments,
+ reconcileAutoCaptionTimelineGaps,
+ splitMergedCaptionsByWordBounds,
+} from "./annotationsFromCaptions";
+export { extractMono16kFromVideoUrl, MAX_CAPTION_AUDIO_SEC } from "./extractMono16k";
+export { shiftTrimRegionsMsForCaptionBuffer, trimLeadingSilenceMono16k } from "./leadingSilence";
+export type {
+ CaptionSegment,
+ CaptionTimestampGranularity,
+ TranscribeMono16kResult,
+} from "./transcribe";
+export { transcribeMono16kToSegments } from "./transcribe";
diff --git a/src/lib/captioning/leadingSilence.ts b/src/lib/captioning/leadingSilence.ts
new file mode 100644
index 000000000..4bd6a11aa
--- /dev/null
+++ b/src/lib/captioning/leadingSilence.ts
@@ -0,0 +1,78 @@
+/** Caption path is always mono 16 kHz after `extractMono16kFromVideoUrl`. */
+import type { TrimRegion } from "@/components/video-editor/types";
+
+const SAMPLE_RATE = 16_000;
+
+/** Window length for peak detection (~50 ms). */
+const WINDOW_SAMPLES = 800;
+
+/** Coarse hop so long intros scan quickly (~50 ms steps). */
+const HOP_SAMPLES = 800;
+
+/** Max |sample| in a window below this counts as silence (float PCM ~[-1, 1]). */
+const PEAK_THRESHOLD = 0.012;
+
+/** Keep a little audio before the first peak so word onsets are not clipped. */
+const PRE_ROLL_SEC = 0.12;
+
+/** Do not scan more than this much audio for leading silence (performance + pathological files). */
+const MAX_LEADING_SCAN_SEC = 15 * 60;
+
+/**
+ * Drops quiet audio at the beginning so Whisper is not fed a long silent prefix (which can skew
+ * the first phrase and wastes work). Returned `trimSec` must be added back to every segment time.
+ */
+export function trimLeadingSilenceMono16k(samples: Float32Array): {
+ samples: Float32Array;
+ trimSec: number;
+} {
+ if (samples.length < WINDOW_SAMPLES) {
+ return { samples, trimSec: 0 };
+ }
+
+ const maxIndex = Math.min(
+ samples.length - WINDOW_SAMPLES,
+ Math.floor(MAX_LEADING_SCAN_SEC * SAMPLE_RATE),
+ );
+
+ let firstSpeechSample = -1;
+ for (let i = 0; i <= maxIndex; i += HOP_SAMPLES) {
+ let peak = 0;
+ for (let j = 0; j < WINDOW_SAMPLES; j++) {
+ peak = Math.max(peak, Math.abs(samples[i + j]!));
+ }
+ if (peak > PEAK_THRESHOLD) {
+ firstSpeechSample = i;
+ break;
+ }
+ }
+
+ if (firstSpeechSample <= 0) {
+ return { samples, trimSec: 0 };
+ }
+
+ const preRollSamples = Math.round(PRE_ROLL_SEC * SAMPLE_RATE);
+ const start = Math.max(0, firstSpeechSample - preRollSamples);
+ return {
+ samples: samples.subarray(start),
+ trimSec: start / SAMPLE_RATE,
+ };
+}
+
+/**
+ * When audio is trimmed from the front, Whisper times are relative to the shortened buffer.
+ * Shift trim regions by the same offset so `segmentOverlapsTrim` still uses consistent coordinates.
+ */
+export function shiftTrimRegionsMsForCaptionBuffer(
+ regions: TrimRegion[],
+ trimMs: number,
+): TrimRegion[] {
+ if (trimMs <= 0) return regions;
+ return regions
+ .map((r) => ({
+ ...r,
+ startMs: Math.max(0, r.startMs - trimMs),
+ endMs: Math.max(0, r.endMs - trimMs),
+ }))
+ .filter((r) => r.endMs > r.startMs);
+}
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
new file mode 100644
index 000000000..91f1d91f0
--- /dev/null
+++ b/src/lib/captioning/transcribe.ts
@@ -0,0 +1,91 @@
+import type { TrimRegion } from "@/components/video-editor/types";
+
+export interface CaptionSegment {
+ startSec: number;
+ endSec: number;
+ text: string;
+}
+
+/** How caption layout should interpret `CaptionSegment` times from `transcribeMono16kToSegments`. */
+export type CaptionTimestampGranularity = "word" | "phrase";
+
+export interface TranscribeMono16kResult {
+ segments: CaptionSegment[];
+ granularity: CaptionTimestampGranularity;
+}
+
+/** Request payload posted from the renderer to the transcription worker. */
+export interface TranscribeWorkerRequest {
+ samples: Float32Array;
+ trimRegions: TrimRegion[];
+}
+
+/** Messages the transcription worker posts back to the renderer. */
+export type TranscribeWorkerResponse =
+ | { type: "status"; phase: "model" | "transcribe" }
+ | { type: "result"; segments: CaptionSegment[]; granularity: CaptionTimestampGranularity }
+ | { type: "error"; message: string };
+
+/**
+ * Transcribes mono 16 kHz audio into timed caption segments using in-browser Whisper.
+ *
+ * The model load and inference run inside a dedicated Web Worker so the editor's
+ * main thread stays responsive (WASM inference does not yield). The first run
+ * downloads model weights. Aborting (via `options.signal`) terminates the worker
+ * immediately, since model load / inference cannot be cooperatively cancelled.
+ */
+export function transcribeMono16kToSegments(
+ samples: Float32Array,
+ options?: {
+ trimRegions?: TrimRegion[];
+ onStatus?: (phase: "model" | "transcribe") => void;
+ signal?: AbortSignal;
+ },
+): Promise {
+ if (options?.signal?.aborted) {
+ return Promise.reject(new DOMException("Aborted", "AbortError"));
+ }
+
+ return new Promise((resolve, reject) => {
+ const worker = new Worker(new URL("./transcribe.worker.ts", import.meta.url), {
+ type: "module",
+ });
+
+ let settled = false;
+ const finish = (fn: () => void) => {
+ if (settled) return;
+ settled = true;
+ options?.signal?.removeEventListener("abort", onAbort);
+ worker.terminate();
+ fn();
+ };
+
+ const onAbort = () => finish(() => reject(new DOMException("Aborted", "AbortError")));
+ options?.signal?.addEventListener("abort", onAbort, { once: true });
+
+ worker.onmessage = (e: MessageEvent) => {
+ const msg = e.data;
+ if (msg.type === "status") {
+ options?.onStatus?.(msg.phase);
+ return;
+ }
+ if (msg.type === "result") {
+ finish(() => resolve({ segments: msg.segments, granularity: msg.granularity }));
+ return;
+ }
+ finish(() => reject(new Error(msg.message)));
+ };
+
+ worker.onerror = (e) => {
+ finish(() => reject(new Error(e.message || "Caption transcription worker failed")));
+ };
+
+ // Structured-clone copy (not a transfer): the caller may reuse `samples`
+ // for the full-buffer retry pass, so the buffer must stay valid here.
+ const request: TranscribeWorkerRequest = {
+ samples,
+ trimRegions: options?.trimRegions ?? [],
+ };
+ worker.postMessage(request);
+ });
+}
diff --git a/src/lib/captioning/transcribe.worker.ts b/src/lib/captioning/transcribe.worker.ts
new file mode 100644
index 000000000..edd16e8ec
--- /dev/null
+++ b/src/lib/captioning/transcribe.worker.ts
@@ -0,0 +1,81 @@
+/**
+ * Web Worker: runs in-browser Whisper transcription off the renderer's main
+ * thread so the editor UI never blocks while the model loads or audio is
+ * transcribed.
+ *
+ * Input message: { samples: Float32Array; trimRegions: TrimRegion[] }
+ * Output messages (see `TranscribeWorkerResponse`):
+ * { type: "status", phase: "model" | "transcribe" } progress updates
+ * { type: "result", segments, granularity } final captions
+ * { type: "error", message } failure detail
+ *
+ * The caller terminates this worker to abort (model load / inference cannot be
+ * cooperatively cancelled), so there is no in-worker abort handling.
+ */
+
+import type { TranscribeWorkerRequest, TranscribeWorkerResponse } from "./transcribe";
+import { runTranscription, type TranscriberFn } from "./transcribeCore";
+
+/** Posts a typed response from this worker back to the thread that created it. */
+const post = (message: TranscribeWorkerResponse): void =>
+	(self as unknown as Worker).postMessage(message);
+
+/**
+ * ONNX Runtime's wasm bundle treats `process.versions.node` (which can leak into an Electron
+ * worker) as Node and tries `require("fs")`, which Vite does not support. Hide it while `fn`
+ * runs and put it back once `fn` settles; masking is best effort and never fails the call.
+ * No-op when `process` is undefined (the usual case in a Web Worker).
+ */
+function withoutNodeVersion<T>(fn: () => Promise<T>): Promise<T> {
+	const versions: { node?: string } | null =
+		typeof process !== "undefined" && process.versions && typeof process.versions === "object"
+			? process.versions
+			: null;
+	const savedNode = versions?.node;
+	const setNode = (value: string | undefined): void => {
+		try {
+			if (versions) versions.node = value;
+		} catch {
+			// Read-only or frozen `versions`: leave it as it is rather than fail the caller.
+		}
+	};
+	if (versions && "node" in versions && !Reflect.deleteProperty(versions, "node")) {
+		setNode(undefined); // deleteProperty returns false (no throw) when it cannot delete
+	}
+	return fn().finally(() => {
+		if (savedNode !== undefined) setNode(savedNode);
+	});
+}
+
+/** Lazily imports Transformers.js and builds the Whisper pipeline with Node detection masked. */
+async function loadTranscriber(): Promise<TranscriberFn> {
+	return withoutNodeVersion(async () => {
+		const { pipeline, env } = await import("@xenova/transformers");
+		// Skip the local model lookup; weights are loaded from the remote host.
+		env.allowLocalModels = false;
+		// Default tiny weights only: the `output_attentions` revision has regressed inference for
+		// some environments (empty chunks / thrown errors) while phrase mode works on this model.
+		const transcriber = await pipeline("automatic-speech-recognition", "Xenova/whisper-tiny");
+		// NOTE(review): double cast bypasses the library's pipeline typings; confirm still needed.
+		return transcriber as unknown as TranscriberFn;
+	});
+}
+
+// Handles one transcription request: load the model, transcribe, then post the outcome.
+self.onmessage = async (event: MessageEvent<TranscribeWorkerRequest>) => {
+	const { samples, trimRegions } = event.data;
+	try {
+		post({ type: "status", phase: "model" });
+		const transcriber = await loadTranscriber();
+		post({ type: "status", phase: "transcribe" });
+		const { segments, granularity } = await runTranscription(
+			transcriber,
+			samples,
+			trimRegions ?? [],
+		);
+		post({ type: "result", segments, granularity });
+	} catch (e) {
+		// Failures travel back as an "error" message so the caller can reject with the detail.
+		post({ type: "error", message: e instanceof Error ? e.message : String(e) });
+	}
+};
diff --git a/src/lib/captioning/transcribeCore.ts b/src/lib/captioning/transcribeCore.ts
new file mode 100644
index 000000000..111995246
--- /dev/null
+++ b/src/lib/captioning/transcribeCore.ts
@@ -0,0 +1,269 @@
+import type { TrimRegion } from "@/components/video-editor/types";
+import type { CaptionSegment, TranscribeMono16kResult } from "./transcribe";
+
+/**
+ * Pure transcription algorithm shared by the captioning Web Worker. It takes an
+ * already-constructed Whisper `transcriber` and turns mono 16 kHz audio into
+ * timed caption segments. Kept free of DOM / Transformers.js imports so it can
+ * run inside a worker and be unit-tested in isolation.
+ */
+
+/** A Transformers.js automatic-speech-recognition pipeline call (result shape varies). */
+export type TranscriberFn = (
+	audio: Float32Array,
+	opts: Record<string, unknown>,
+) => Promise<unknown>;
+
+function segmentOverlapsTrim(startMs: number, endMs: number, trims: TrimRegion[]): boolean {
+	return trims.some(({ startMs: from, endMs: to }) => from < endMs && to > startMs);
+}
+
+/** Drops segments overlapping a trim region; post-filter for retry passes run without trims. */
+function dropSegmentsOverlappingTrimRegions(
+	segments: CaptionSegment[],
+	trimRegions: TrimRegion[],
+): CaptionSegment[] {
+	if (trimRegions.length === 0) return segments; // same array, not a copy
+	const toMs = (sec: number): number => Math.round(sec * 1000);
+	// Keep only segments whose rounded millisecond span misses every trim region.
+	return segments.filter(
+		({ startSec, endSec }) => !segmentOverlapsTrim(toMs(startSec), toMs(endSec), trimRegions),
+	);
+}
+
+/** Cap per transcriber call: 12 min of 16 kHz audio, so one forward pass stays within WASM memory. */
+const TRANSCRIBE_SLICE_SAMPLES = 16_000 * 60 * 12;
+
+/** Slices below 50 ms (at 16 kHz) are zero-padded to this length by `padTailSliceForTranscribe`. */
+const MIN_TRANSCRIBE_SLICE_SAMPLES = 16_000 / 20;
+
+/**
+ * Zero-pads inputs shorter than `MIN_TRANSCRIBE_SLICE_SAMPLES` so Whisper still runs on them.
+ * `realDurationSec` is the unpadded length, letting callers clamp timestamps to real audio.
+ */
+function padTailSliceForTranscribe(samples: Float32Array): {
+	slice: Float32Array;
+	realDurationSec: number;
+} {
+	let slice = samples;
+	if (slice.length < MIN_TRANSCRIBE_SLICE_SAMPLES) {
+		// Typed arrays start zero-filled, so copying in the samples leaves trailing silence.
+		slice = new Float32Array(MIN_TRANSCRIBE_SLICE_SAMPLES);
+		slice.set(samples);
+	}
+	return { slice, realDurationSec: samples.length / 16_000 };
+}
+
+/**
+ * Converts one slice's raw Whisper chunks into caption segments on the full timeline: sorted,
+ * clamped to `audioDurationSec` (the slice's real, unpadded length), shifted by
+ * `timeOffsetSec`, filtered against `trims`, and with touching same-text neighbours merged.
+ * Chunks without a `timestamp` or with blank text are skipped.
+ */
+function segmentsFromTranscriberChunks(
+	chunks: Array<{ timestamp?: [number | null, number | null]; text?: unknown }>,
+	timeOffsetSec: number,
+	trims: TrimRegion[],
+	audioDurationSec: number,
+): CaptionSegment[] {
+	// Chunks with no numeric start sort first; they are treated as starting at 0 below.
+	const startOf = (c: (typeof chunks)[number]): number => {
+		const start = c.timestamp?.[0];
+		return typeof start === "number" ? start : -1;
+	};
+	const sorted = [...chunks].sort((x, y) => startOf(x) - startOf(y));
+	const sliceEnd = timeOffsetSec + audioDurationSec;
+	const segments: CaptionSegment[] = [];
+
+	for (let idx = 0; idx < sorted.length; idx++) {
+		const c = sorted[idx]!;
+		const ts = c.timestamp;
+		if (!ts) continue;
+		// Clamp the start too: a start beyond the real audio (e.g. inside zero padding) used to
+		// produce segments whose endSec was smaller than their startSec.
+		const a = Math.min(Math.max(0, ts[0] ?? 0), audioDurationSec);
+		let b = ts[1];
+		if (b == null) {
+			// Open-ended chunk: end where the next timestamped chunk starts, else at slice end.
+			b = audioDurationSec;
+			for (let j = idx + 1; j < sorted.length; j++) {
+				const na = sorted[j]?.timestamp?.[0];
+				if (typeof na === "number") {
+					b = na;
+					break;
+				}
+			}
+		}
+		if (b <= a) b = a + 0.25;
+		b = Math.min(b, audioDurationSec);
+
+		const text = String(c.text ?? "").replace(/\s+/g, " ").trim();
+		if (!text) continue;
+
+		const startSec = a + timeOffsetSec;
+		// Show each caption for at least 80 ms, but never past the end of the slice.
+		const endSec = Math.min(Math.max(startSec + 0.08, b + timeOffsetSec), sliceEnd);
+		const startMs = Math.round(startSec * 1000);
+		const endMs = Math.round(endSec * 1000);
+		if (segmentOverlapsTrim(startMs, endMs, trims)) continue;
+		segments.push({ startSec, endSec, text });
+	}
+
+	// Merge neighbours with identical text that touch or overlap into a single segment.
+	segments.sort((u, v) => u.startSec - v.startSec || u.endSec - v.endSec);
+	const merged: CaptionSegment[] = [];
+	for (const seg of segments) {
+		const prev = merged[merged.length - 1];
+		if (prev && prev.text === seg.text && seg.startSec <= prev.endSec) {
+			prev.endSec = Math.max(prev.endSec, seg.endSec);
+			prev.startSec = Math.min(prev.startSec, seg.startSec);
+			continue;
+		}
+		merged.push(seg);
+	}
+	return merged;
+}
+
+/** Runs the transcriber on one slice in the given timestamp mode, chunking only long clips. */
+async function runTranscriberOnSlice(
+	transcriber: TranscriberFn,
+	samples: Float32Array,
+	opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase" },
+): Promise<unknown> {
+	const durationSec = samples.length / 16_000;
+	// Only chunk long clips; short-audio chunking regressed some Whisper.js runs (empty chunks).
+	const chunking = durationSec > 30 ? { chunk_length_s: 30, stride_length_s: 5 } : {};
+	return transcriber(samples, {
+		return_timestamps: opts.timestampMode === "word" ? "word" : true,
+		force_full_sequences: opts.forceFullSequences,
+		...chunking,
+	});
+}
+
+/**
+ * Gathers the `chunks` arrays out of an ASR result. Accepts a single result object or an
+ * array of them (concatenated in order); anything without a `chunks` array contributes nothing.
+ */
+function getChunksFromTranscriberResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	if (result == null) return [];
+
+	// Optional chaining keeps null / undefined entries inside a batch harmless.
+	const chunksOf = (item: unknown) => {
+		const chunks = (item as { chunks?: unknown } | null | undefined)?.chunks;
+		return Array.isArray(chunks) ? chunks : [];
+	};
+	return Array.isArray(result) ? result.flatMap((item) => chunksOf(item)) : chunksOf(result);
+}
+
+/**
+ * Returns the result's timestamped chunks, or, when it only carries top-level `text`, a
+ * single open-ended chunk (`[0, null]`) holding that text so it can still be placed in time.
+ */
+function extractChunksFromAsrResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	const chunked = getChunksFromTranscriberResult(result);
+	if (chunked.length > 0) return chunked;
+
+	// Fallback: for an array result only the first entry's text is considered.
+	const first: unknown = Array.isArray(result) ? result[0] : result;
+	const rawText = (first as { text?: unknown } | null | undefined)?.text;
+	const text = typeof rawText === "string" ? rawText.trim() : "";
+	return text ? [{ timestamp: [0, null], text }] : [];
+}
+
+/**
+ * Drives Whisper over mono 16 kHz audio and returns timed caption segments.
+ *
+ * Audio longer than `TRANSCRIBE_SLICE_SAMPLES` is transcribed slice by slice so one forward
+ * pass does not exhaust WASM memory; each slice's timestamps are shifted back onto the full
+ * timeline. Word-level timestamps are tried first, then phrase-level. Within each mode the
+ * passes are: forced full sequences, then unforced, then (only if `trims` is non-empty) the
+ * same two again with trim filtering deferred until after the pass.
+ *
+ * @param transcriber Whisper pipeline call, already constructed by the caller.
+ * @param samples Mono 16 kHz audio samples for the whole clip.
+ * @param trims Trimmed-out regions; segments overlapping any of them are never returned.
+ * @returns Segments from the first pass that produced any, with the timestamp mode used as
+ *   `granularity`; `{ segments: [], granularity: "phrase" }` when every pass came back empty.
+ */
+export async function runTranscription(
+	transcriber: TranscriberFn,
+	samples: Float32Array,
+	trims: TrimRegion[],
+): Promise<TranscribeMono16kResult> {
+	/**
+	 * One complete pass over `samples`. Best effort by design: if any slice throws, the failure
+	 * is logged and the whole pass counts as empty so the caller moves on to its next fallback.
+	 */
+	const transcribeOne = async (
+		ignoreTrims: boolean,
+		forceFullSequences: boolean,
+		timestampMode: "word" | "phrase",
+	): Promise<CaptionSegment[]> => {
+		const activeTrims = ignoreTrims ? [] : trims;
+		const all: CaptionSegment[] = [];
+		try {
+			// do/while: audio that fits in one slice (even an empty buffer) still gets one pass.
+			let offset = 0;
+			do {
+				const end = Math.min(offset + TRANSCRIBE_SLICE_SAMPLES, samples.length);
+				// Pass the caller's array untouched when a single slice covers all of it.
+				const wholeClip = offset === 0 && end === samples.length;
+				const sliceRaw = wholeClip ? samples : samples.subarray(offset, end);
+				// Only the final slice can be shorter than the minimum; padding is a no-op otherwise.
+				const { slice, realDurationSec } = padTailSliceForTranscribe(sliceRaw);
+				const result = await runTranscriberOnSlice(transcriber, slice, {
+					forceFullSequences,
+					timestampMode,
+				});
+				all.push(
+					...segmentsFromTranscriberChunks(
+						extractChunksFromAsrResult(result),
+						offset / 16_000,
+						activeTrims,
+						realDurationSec,
+					),
+				);
+				offset = end;
+			} while (offset < samples.length);
+			return all;
+		} catch (e) {
+			console.warn("[captioning] Whisper pass failed:", e);
+			return [];
+		}
+	};
+
+	const attemptModes: Array<"word" | "phrase"> = ["word", "phrase"];
+	for (const timestampMode of attemptModes) {
+		// First try with `force_full_sequences` on, then off.
+		let segments = await transcribeOne(false, true, timestampMode);
+		if (segments.length === 0) {
+			segments = await transcribeOne(false, false, timestampMode);
+		}
+		// NOTE(review): the transcriber sees the same samples whether or not trims are ignored,
+		// so these retries differ from the passes above only in when the trim filter is applied
+		// (after de-duplication instead of before). Confirm they are still worth the extra passes.
+		if (segments.length === 0 && trims.length > 0) {
+			segments = dropSegmentsOverlappingTrimRegions(
+				await transcribeOne(true, true, timestampMode),
+				trims,
+			);
+			if (segments.length === 0) {
+				segments = dropSegmentsOverlappingTrimRegions(
+					await transcribeOne(true, false, timestampMode),
+					trims,
+				);
+			}
+		}
+		if (segments.length > 0) {
+			return { segments, granularity: timestampMode };
+		}
+	}
+
+	return { segments: [], granularity: "phrase" };
+}
diff --git a/src/lib/vite-stubs/empty-node-module.ts b/src/lib/vite-stubs/empty-node-module.ts
new file mode 100644
index 000000000..16ee52688
--- /dev/null
+++ b/src/lib/vite-stubs/empty-node-module.ts
@@ -0,0 +1,7 @@
+/**
+ * Prototype-less default export with no keys at all. Used as a Vite alias target for the
+ * Node builtins that `@xenova/transformers` imports; `env.js` treats an empty object as
+ * “no filesystem” so it stays on browser / remote paths.
+ */
+const empty: Record<string, never> = Object.create(null);
+export default empty;
diff --git a/src/lib/vite-stubs/onnxruntime-node-stub.ts b/src/lib/vite-stubs/onnxruntime-node-stub.ts
new file mode 100644
index 000000000..a70b3dd60
--- /dev/null
+++ b/src/lib/vite-stubs/onnxruntime-node-stub.ts
@@ -0,0 +1,10 @@
+/**
+ * Stand-in for `onnxruntime-node`. Transformers always imports that package, then picks web vs
+ * node from `process.release.name`, which in Electron's renderer is often `"node"` although the
+ * WASM build must be used. The real package is aliased away (it pulls `fs`), so the "node" branch
+ * receives `onnxruntime-web` (`registerBackend` etc.), unwrapped from `default` when present.
+ */
+import * as ortWeb from "onnxruntime-web";
+
+const interop = ortWeb as { default?: typeof ortWeb };
+export default interop.default ?? ortWeb;
diff --git a/vite.config.ts b/vite.config.ts
index 0779e1358..213e44711 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -28,8 +28,22 @@ export default defineConfig({
resolve: {
alias: {
"@": path.resolve(__dirname, "src"),
+ // @xenova/transformers: env.js statically imports fs/path/url; onnx.js imports
+ // onnxruntime-node (must not be bundled in the renderer — it requires fs).
+ fs: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+ path: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+ url: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+ "onnxruntime-node": path.resolve(__dirname, "src/lib/vite-stubs/onnxruntime-node-stub.ts"), // re-exports web ORT
},
},
+ optimizeDeps: {
+ exclude: ["@xenova/transformers"],
+ },
+ // The captioning worker dynamically imports @xenova/transformers, which makes the
+ // worker bundle code-split — unsupported by the default "iife" worker format.
+ worker: {
+ format: "es",
+ },
build: {
target: "esnext",
minify: "terser",