From 7216f255a069a6a50cd9c1ae24eb51ddf68f47f7 Mon Sep 17 00:00:00 2001
From: parse-nip <152457438+parse-nip@users.noreply.github.com>
Date: Wed, 13 May 2026 22:33:10 +0800
Subject: [PATCH 1/7] feat: add video transcriptions (in-browser auto captions)

Decode 16 kHz mono audio from the editor video, run Whisper via Transformers.js, and insert linked text annotations with timing and layout helpers.

Add Vite resolve shims for Node-only imports used by the model stack, an optional leading-silence trim for the caption buffer, timeline gap reconciliation for auto-caption regions, and editor i18n. Raise the Select content z-index so caption controls stay usable over the video surface.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 package-lock.json                             | 757 +++++++++++++++++-
 package.json                                  |   1 +
 src/components/ui/select.tsx                  |   3 +-
 src/components/video-editor/VideoEditor.tsx   | 299 ++++++-
 .../video-editor/projectPersistence.ts        |   2 +
 src/components/video-editor/types.ts          |   2 +
 src/i18n/locales/ar/editor.json               |  18 +
 src/i18n/locales/en/editor.json               |  18 +
 src/i18n/locales/es/editor.json               |  18 +
 src/i18n/locales/fr/editor.json               |  20 +-
 src/i18n/locales/ja-JP/editor.json            |  18 +
 src/i18n/locales/ko-KR/editor.json            |  18 +
 src/i18n/locales/tr/editor.json               |  18 +
 src/i18n/locales/zh-CN/editor.json            |  18 +
 src/i18n/locales/zh-TW/editor.json            |  18 +
 src/lib/captioning/annotationsFromCaptions.ts | 499 ++++++++++++
 src/lib/captioning/captionConstants.ts        |   2 +
 src/lib/captioning/extractMono16k.ts          | 159 ++++
 .../captioning/extractMono16kWebDemuxer.ts    | 187 +++++
 src/lib/captioning/index.ts                   |  17 +
 src/lib/captioning/leadingSilence.ts          |  83 ++
 src/lib/captioning/transcribe.ts              | 286 +++++++
 src/lib/vite-stubs/empty-node-module.ts       |   7 +
 src/lib/vite-stubs/onnxruntime-node-stub.ts   |  10 +
 vite.config.ts                                |   9 +
 25 files changed, 2441 insertions(+), 46 deletions(-)
 create mode 100644 src/lib/captioning/annotationsFromCaptions.ts
 create mode 100644 src/lib/captioning/captionConstants.ts
 create mode 100644 src/lib/captioning/extractMono16k.ts
 create mode 100644 src/lib/captioning/extractMono16kWebDemuxer.ts
 create mode 100644 src/lib/captioning/index.ts
 create mode 100644 src/lib/captioning/leadingSilence.ts
 create mode 100644 src/lib/captioning/transcribe.ts
 create mode 100644 src/lib/vite-stubs/empty-node-module.ts
 create mode 100644 src/lib/vite-stubs/onnxruntime-node-stub.ts

diff --git a/package-lock.json b/package-lock.json
index e823ad1c0..7bbcf035c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,13 @@
 {
 	"name": "openscreen",
-	"version": "1.3.0",
+	"version": "1.4.0",
 	"lockfileVersion": 3,
 	"requires": true,
 	"packages": {
 		"": {
 			"name": "openscreen",
-			"version": "1.3.0",
+			"version": "1.4.0",
+			"hasInstallScript": true,
 			"dependencies": {
 				"@fix-webm-duration/fix": "^1.0.1",
 				"@pixi/filter-drop-shadow": "^5.2.0",
@@ -26,6 +27,7 @@
 				"@uiw/color-convert": "^2.10.1",
 				"@uiw/react-color-block": "^2.10.1",
 				"@uiw/react-color-colorful": "^2.9.2",
+				"@xenova/transformers": "^2.17.2",
 				"class-variance-authority": "^0.7.1",
 				"clsx": "^2.1.1",
 				"dnd-timeline": "^2.4.0",
@@ -1773,6 +1775,15 @@
 			"integrity": "sha512-RiB/yIh78pcIxl6lLMG0CgBXAZ2Y0eVHqMPYugu+9U0AeT6YBeiJpf7lbdJNIugFP5SIjwNRgo4DhR1Qxi26Gg==",
 			"license": "MIT"
 		},
+		"node_modules/@huggingface/jinja": {
+			"version": "0.2.2",
+			"resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz",
+			"integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==",
+			"license": "MIT",
+			"engines": {
+				"node": ">=18"
+			}
+		},
 		"node_modules/@isaacs/fs-minipass": {
 			"version": "4.0.1",
 			"resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz",
@@ -2105,6 +2116,70 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/@protobufjs/aspromise": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
+			"integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/base64": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
+			"integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/codegen": {
+			"version": "2.0.5",
+			"resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.5.tgz",
+			"integrity": "sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/eventemitter": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
+			"integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/fetch": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
+			"integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
+			"license": "BSD-3-Clause",
+			"dependencies": {
+				"@protobufjs/aspromise": "^1.1.1",
+				"@protobufjs/inquire": "^1.1.0"
+			}
+		},
+		"node_modules/@protobufjs/float": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
+			"integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/inquire": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.1.tgz",
+			"integrity": "sha512-mnzgDV26ueAvk7rsbt9L7bE0SuAoqyuys/sMMrmVcN5x9VsxpcG3rqAUSgDyLp0UZlmNfIbQ4fHfCtreVBk8Ew==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/path": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
+			"integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/pool": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
+			"integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/utf8": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.1.tgz",
+			"integrity": "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==",
+			"license": "BSD-3-Clause"
+		},
 		"node_modules/@radix-ui/number": {
 			"version": "1.1.1",
 			"resolved": "https://registry.npmjs.org/@radix-ui/number/-/number-1.1.1.tgz",
@@ -3823,6 +3898,12 @@
 				"@types/node": "*"
 			}
 		},
+		"node_modules/@types/long": {
+			"version": "4.0.2",
+			"resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",
+			"integrity": "sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==",
+			"license": "MIT"
+		},
 		"node_modules/@types/ms": {
 			"version": "2.1.0",
 			"resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz",
@@ -3834,7 +3915,6 @@
 			"version": "22.19.17",
 			"resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.17.tgz",
 			"integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"undici-types": "~6.21.0"
@@ -4294,6 +4374,20 @@
 			"integrity": "sha512-RPmm6kgRbI8e98zSD3RVACvnuktIja5+yLgDAkTmxLr90BEwdTXRQWNLF3ETTTyH/8mKhznZuN5AveXYFEsMGQ==",
 			"license": "BSD-3-Clause"
 		},
+		"node_modules/@xenova/transformers": {
+			"version": "2.17.2",
+			"resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.2.tgz",
+			"integrity": "sha512-lZmHqzrVIkSvZdKZEx7IYY51TK0WDrC8eR0c5IMnBsO8di8are1zzw8BlLhyO2TklZKLN5UffNGs1IJwT6oOqQ==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"@huggingface/jinja": "^0.2.2",
+				"onnxruntime-web": "1.14.0",
+				"sharp": "^0.32.0"
+			},
+			"optionalDependencies": {
+				"onnxruntime-node": "1.14.0"
+			}
+		},
 		"node_modules/@xmldom/xmldom": {
 			"version": "0.8.13",
 			"resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.13.tgz",
@@ -4764,11 +4858,101 @@
 				"node": "18 || 20 || >=22"
 			}
 		},
+		"node_modules/bare-events": {
+			"version": "2.8.2",
+			"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
+			"integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
+			"license": "Apache-2.0",
+			"peerDependencies": {
+				"bare-abort-controller": "*"
+			},
+			"peerDependenciesMeta": {
+				"bare-abort-controller": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/bare-fs": {
+			"version": "4.7.1",
+			"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.7.1.tgz",
+			"integrity": "sha512-WDRsyVN52eAx/lBamKD6uyw8H4228h/x0sGGGegOamM2cd7Pag88GfMQalobXI+HaEUxpCkbKQUDOQqt9wawRw==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-events": "^2.5.4",
+				"bare-path": "^3.0.0",
+				"bare-stream": "^2.6.4",
+				"bare-url": "^2.2.2",
+				"fast-fifo": "^1.3.2"
+			},
+			"engines": {
+				"bare": ">=1.16.0"
+			},
+			"peerDependencies": {
+				"bare-buffer": "*"
+			},
+			"peerDependenciesMeta": {
+				"bare-buffer": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/bare-os": {
+			"version": "3.9.1",
+			"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.9.1.tgz",
+			"integrity": "sha512-6M5XjcnsygQNPMCMPXSK379xrJFiZ/AEMNBmFEmQW8d/789VQATvriyi5r0HYTL9TkQ26rn3kgdTG3aisbrXkQ==",
+			"license": "Apache-2.0",
+			"engines": {
+				"bare": ">=1.14.0"
+			}
+		},
+		"node_modules/bare-path": {
+			"version": "3.0.0",
+			"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
+			"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-os": "^3.0.1"
+			}
+		},
+		"node_modules/bare-stream": {
+			"version": "2.13.1",
+			"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.13.1.tgz",
+			"integrity": "sha512-Vp0cnjYyrEC4whYTymQ+YZi6pBpfiICZO3cfRG8sy67ZNWe951urv1x4eW1BKNngw3U+3fPYb5JQvHbCtxH7Ow==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"streamx": "^2.25.0",
+				"teex": "^1.0.1"
+			},
+			"peerDependencies": {
+				"bare-abort-controller": "*",
+				"bare-buffer": "*",
+				"bare-events": "*"
+			},
+			"peerDependenciesMeta": {
+				"bare-abort-controller": {
+					"optional": true
+				},
+				"bare-buffer": {
+					"optional": true
+				},
+				"bare-events": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/bare-url": {
+			"version": "2.4.3",
+			"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.4.3.tgz",
+			"integrity": "sha512-Kccpc7ACfXaxfeInfqKcZtW4pT5YBn1mesc4sCsun6sRwtbJ4h+sNOaksUpYEJUKfN65YWC6Bw2OJEFiKxq8nQ==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-path": "^3.0.0"
+			}
+		},
 		"node_modules/base64-js": {
 			"version": "1.5.1",
 			"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
 			"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@@ -4820,6 +5004,17 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/bl": {
+			"version": "4.1.0",
+			"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
+			"integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==",
+			"license": "MIT",
+			"dependencies": {
+				"buffer": "^5.5.0",
+				"inherits": "^2.0.4",
+				"readable-stream": "^3.4.0"
+			}
+		},
 		"node_modules/boolean": {
 			"version": "3.2.0",
 			"resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz",
@@ -4892,7 +5087,6 @@
 			"version": "5.7.1",
 			"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
 			"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@@ -4908,7 +5102,6 @@
 				}
 			],
 			"license": "MIT",
-			"optional": true,
 			"dependencies": {
 				"base64-js": "^1.3.1",
 				"ieee754": "^1.1.13"
@@ -5307,11 +5500,23 @@
 				"node": ">=6"
 			}
 		},
+		"node_modules/color": {
+			"version": "4.2.3",
+			"resolved": "https://registry.npmjs.org/color/-/color-4.2.3.tgz",
+			"integrity": "sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==",
+			"license": "MIT",
+			"dependencies": {
+				"color-convert": "^2.0.1",
+				"color-string": "^1.9.0"
+			},
+			"engines": {
+				"node": ">=12.5.0"
+			}
+		},
 		"node_modules/color-convert": {
 			"version": "2.0.1",
 			"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
 			"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"color-name": "~1.1.4"
@@ -5324,9 +5529,18 @@
 			"version": "1.1.4",
 			"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
 			"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
-			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/color-string": {
+			"version": "1.9.1",
+			"resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz",
+			"integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==",
+			"license": "MIT",
+			"dependencies": {
+				"color-name": "^1.0.0",
+				"simple-swizzle": "^0.2.2"
+			}
+		},
 		"node_modules/colorette": {
 			"version": "2.0.20",
 			"resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz",
@@ -5530,7 +5744,6 @@
 			"version": "6.0.0",
 			"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
 			"integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"mimic-response": "^3.1.0"
@@ -5546,7 +5759,6 @@
 			"version": "3.1.0",
 			"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
 			"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
-			"dev": true,
 			"license": "MIT",
 			"engines": {
 				"node": ">=10"
@@ -5555,6 +5767,15 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/deep-extend": {
+			"version": "0.6.0",
+			"resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz",
+			"integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==",
+			"license": "MIT",
+			"engines": {
+				"node": ">=4.0.0"
+			}
+		},
 		"node_modules/defer-to-connect": {
 			"version": "2.0.1",
 			"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-2.0.1.tgz",
@@ -5623,6 +5844,15 @@
 				"node": ">=6"
 			}
 		},
+		"node_modules/detect-libc": {
+			"version": "2.1.2",
+			"resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz",
+			"integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==",
+			"license": "Apache-2.0",
+			"engines": {
+				"node": ">=8"
+			}
+		},
 		"node_modules/detect-node": {
 			"version": "2.1.0",
 			"resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz",
@@ -6097,7 +6327,6 @@
 			"version": "1.4.5",
 			"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
 			"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"once": "^1.4.0"
@@ -6290,6 +6519,24 @@
 			"license": "MIT",
 			"peer": true
 		},
+		"node_modules/events-universal": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
+			"integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-events": "^2.7.0"
+			}
+		},
+		"node_modules/expand-template": {
+			"version": "2.0.3",
+			"resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz",
+			"integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==",
+			"license": "(MIT OR WTFPL)",
+			"engines": {
+				"node": ">=6"
+			}
+		},
 		"node_modules/expect-type": {
 			"version": "1.3.0",
 			"resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz",
@@ -6369,6 +6616,12 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/fast-fifo": {
+			"version": "1.3.2",
+			"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
+			"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
+			"license": "MIT"
+		},
 		"node_modules/fast-glob": {
 			"version": "3.3.3",
 			"resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz",
@@ -6504,6 +6757,12 @@
 			"integrity": "sha512-IKlE+pNvL2R+kVL1kEhUYqRxVqeFnjiIvHWDMLFXNaqyUdFXQM2wte44EfMYJNHkW16X991t2Zg8apKkhv7OBA==",
 			"license": "MIT"
 		},
+		"node_modules/flatbuffers": {
+			"version": "1.12.0",
+			"resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz",
+			"integrity": "sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==",
+			"license": "SEE LICENSE IN LICENSE.txt"
+		},
 		"node_modules/form-data": {
 			"version": "4.0.5",
 			"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz",
@@ -6562,6 +6821,12 @@
 				}
 			}
 		},
+		"node_modules/fs-constants": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
+			"integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==",
+			"license": "MIT"
+		},
 		"node_modules/fs-extra": {
 			"version": "8.1.0",
 			"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz",
@@ -6717,6 +6982,12 @@
 				"js-binary-schema-parser": "^2.0.3"
 			}
 		},
+		"node_modules/github-from-package": {
+			"version": "0.0.0",
+			"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
+			"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
+			"license": "MIT"
+		},
 		"node_modules/glob": {
 			"version": "7.2.3",
 			"resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
@@ -6884,6 +7155,12 @@
 			"integrity": "sha512-dMW4CWBTUK1AEEDeZc1g4xpPGIrSf9fJF960qbTZmN/QwZIWY5wgliS6JWl9/25fpTGJrMRtSjGtOmPnfjZB+A==",
 			"license": "Standard 'no charge' license: https://gsap.com/standard-license."
 		},
+		"node_modules/guid-typescript": {
+			"version": "1.0.9",
+			"resolved": "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz",
+			"integrity": "sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==",
+			"license": "ISC"
+		},
 		"node_modules/has-flag": {
 			"version": "4.0.0",
 			"resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
@@ -7094,7 +7371,6 @@
 			"version": "1.2.1",
 			"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
 			"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@@ -7109,8 +7385,7 @@
 					"url": "https://feross.org/support"
 				}
 			],
-			"license": "BSD-3-Clause",
-			"optional": true
+			"license": "BSD-3-Clause"
 		},
 		"node_modules/indent-string": {
 			"version": "4.0.0",
@@ -7138,9 +7413,20 @@
 			"version": "2.0.4",
 			"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
 			"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
-			"dev": true,
 			"license": "ISC"
 		},
+		"node_modules/ini": {
+			"version": "1.3.8",
+			"resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
+			"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==",
+			"license": "ISC"
+		},
+		"node_modules/is-arrayish": {
+			"version": "0.3.4",
+			"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz",
+			"integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==",
+			"license": "MIT"
+		},
 		"node_modules/is-binary-path": {
 			"version": "2.1.0",
 			"resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz",
@@ -7653,6 +7939,12 @@
 				"url": "https://github.com/chalk/slice-ansi?sponsor=1"
 			}
 		},
+		"node_modules/long": {
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz",
+			"integrity": "sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==",
+			"license": "Apache-2.0"
+		},
 		"node_modules/loose-envify": {
 			"version": "1.4.0",
 			"resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
@@ -7885,7 +8177,6 @@
 			"version": "1.2.8",
 			"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
 			"integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
-			"dev": true,
 			"license": "MIT",
 			"funding": {
 				"url": "https://github.com/sponsors/ljharb"
@@ -7928,6 +8219,12 @@
 				"mkdirp": "bin/cmd.js"
 			}
 		},
+		"node_modules/mkdirp-classic": {
+			"version": "0.5.3",
+			"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
+			"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==",
+			"license": "MIT"
+		},
 		"node_modules/motion": {
 			"version": "12.38.0",
 			"resolved": "https://registry.npmjs.org/motion/-/motion-12.38.0.tgz",
@@ -8024,6 +8321,12 @@
 				"node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
 			}
 		},
+		"node_modules/napi-build-utils": {
+			"version": "2.0.0",
+			"resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz",
+			"integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==",
+			"license": "MIT"
+		},
 		"node_modules/node-abi": {
 			"version": "4.28.0",
 			"resolved": "https://registry.npmjs.org/node-abi/-/node-abi-4.28.0.tgz",
@@ -8268,7 +8571,6 @@
 			"version": "1.4.0",
 			"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
 			"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
-			"dev": true,
 			"license": "ISC",
 			"dependencies": {
 				"wrappy": "1"
@@ -8290,6 +8592,50 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/onnx-proto": {
+			"version": "4.0.4",
+			"resolved": "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz",
+			"integrity": "sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==",
+			"license": "MIT",
+			"dependencies": {
+				"protobufjs": "^6.8.8"
+			}
+		},
+		"node_modules/onnxruntime-common": {
+			"version": "1.14.0",
+			"resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz",
+			"integrity": "sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==",
+			"license": "MIT"
+		},
+		"node_modules/onnxruntime-node": {
+			"version": "1.14.0",
+			"resolved": "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz",
+			"integrity": "sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==",
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"win32",
+				"darwin",
+				"linux"
+			],
+			"dependencies": {
+				"onnxruntime-common": "~1.14.0"
+			}
+		},
+		"node_modules/onnxruntime-web": {
+			"version": "1.14.0",
+			"resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz",
+			"integrity": "sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==",
+			"license": "MIT",
+			"dependencies": {
+				"flatbuffers": "^1.12.0",
+				"guid-typescript": "^1.0.9",
+				"long": "^4.0.0",
+				"onnx-proto": "^4.0.4",
+				"onnxruntime-common": "~1.14.0",
+				"platform": "^1.3.6"
+			}
+		},
 		"node_modules/p-cancelable": {
 			"version": "2.1.1",
 			"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
@@ -8482,6 +8828,12 @@
 			"integrity": "sha512-mlsTRyGaPBjPedk6Bvw+aqbsXDtoAyAzm5MO7JgU+yVRyMQ5O8bD4Kcci7BS85f93veegeCPkL8R4GLClnjLFw==",
 			"license": "MIT"
 		},
+		"node_modules/platform": {
+			"version": "1.3.6",
+			"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
+			"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
+			"license": "MIT"
+		},
 		"node_modules/playwright": {
 			"version": "1.59.1",
 			"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
@@ -8725,6 +9077,91 @@
 				"node": "^12.20.0 || >=14"
 			}
 		},
+		"node_modules/prebuild-install": {
+			"version": "7.1.3",
+			"resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz",
+			"integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==",
+			"deprecated": "No longer maintained. Please contact the author of the relevant native addon; alternatives are available.",
+			"license": "MIT",
+			"dependencies": {
+				"detect-libc": "^2.0.0",
+				"expand-template": "^2.0.3",
+				"github-from-package": "0.0.0",
+				"minimist": "^1.2.3",
+				"mkdirp-classic": "^0.5.3",
+				"napi-build-utils": "^2.0.0",
+				"node-abi": "^3.3.0",
+				"pump": "^3.0.0",
+				"rc": "^1.2.7",
+				"simple-get": "^4.0.0",
+				"tar-fs": "^2.0.0",
+				"tunnel-agent": "^0.6.0"
+			},
+			"bin": {
+				"prebuild-install": "bin.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/chownr": {
+			"version": "1.1.4",
+			"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
+			"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==",
+			"license": "ISC"
+		},
+		"node_modules/prebuild-install/node_modules/node-abi": {
+			"version": "3.92.0",
+			"resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.92.0.tgz",
+			"integrity": "sha512-KdHvFWZjEKDf0cakgFjebl371GPsISX2oZHcuyKqM7DtogIsHrqKeLTo8wBHxaXRAQlY2PsPlZmfo+9ZCxEREQ==",
+			"license": "MIT",
+			"dependencies": {
+				"semver": "^7.3.5"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/semver": {
+			"version": "7.8.0",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.8.0.tgz",
+			"integrity": "sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==",
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/tar-fs": {
+			"version": "2.1.4",
+			"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz",
+			"integrity": "sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==",
+			"license": "MIT",
+			"dependencies": {
+				"chownr": "^1.1.1",
+				"mkdirp-classic": "^0.5.2",
+				"pump": "^3.0.0",
+				"tar-stream": "^2.1.4"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/tar-stream": {
+			"version": "2.2.0",
+			"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz",
+			"integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==",
+			"license": "MIT",
+			"dependencies": {
+				"bl": "^4.0.3",
+				"end-of-stream": "^1.4.1",
+				"fs-constants": "^1.0.0",
+				"inherits": "^2.0.3",
+				"readable-stream": "^3.1.1"
+			},
+			"engines": {
+				"node": ">=6"
+			}
+		},
 		"node_modules/pretty-format": {
 			"version": "27.5.1",
 			"resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-27.5.1.tgz",
@@ -8818,11 +9255,36 @@
 				"signal-exit": "^3.0.2"
 			}
 		},
+		"node_modules/protobufjs": {
+			"version": "6.11.6",
+			"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.6.tgz",
+			"integrity": "sha512-k8BHqgPBOtrlougZZqF2uUk5Z7bN8f0wj+3e8M3hvtSv0NBAz4VBy5f6R5Nxq/l+i7mRFTgNZb2trxqTpHNY/A==",
+			"hasInstallScript": true,
+			"license": "BSD-3-Clause",
+			"dependencies": {
+				"@protobufjs/aspromise": "^1.1.2",
+				"@protobufjs/base64": "^1.1.2",
+				"@protobufjs/codegen": "^2.0.4",
+				"@protobufjs/eventemitter": "^1.1.0",
+				"@protobufjs/fetch": "^1.1.0",
+				"@protobufjs/float": "^1.0.2",
+				"@protobufjs/inquire": "^1.1.0",
+				"@protobufjs/path": "^1.1.2",
+				"@protobufjs/pool": "^1.1.0",
+				"@protobufjs/utf8": "^1.1.0",
+				"@types/long": "^4.0.1",
+				"@types/node": ">=13.7.0",
+				"long": "^4.0.0"
+			},
+			"bin": {
+				"pbjs": "bin/pbjs",
+				"pbts": "bin/pbts"
+			}
+		},
 		"node_modules/pump": {
 			"version": "3.0.4",
 			"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.4.tgz",
 			"integrity": "sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"end-of-stream": "^1.1.0",
@@ -8905,6 +9367,21 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/rc": {
+			"version": "1.2.8",
+			"resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz",
+			"integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==",
+			"license": "(BSD-2-Clause OR MIT OR Apache-2.0)",
+			"dependencies": {
+				"deep-extend": "^0.6.0",
+				"ini": "~1.3.0",
+				"minimist": "^1.2.0",
+				"strip-json-comments": "~2.0.1"
+			},
+			"bin": {
+				"rc": "cli.js"
+			}
+		},
 		"node_modules/re-resizable": {
 			"version": "6.11.2",
 			"resolved": "https://registry.npmjs.org/re-resizable/-/re-resizable-6.11.2.tgz",
@@ -9103,6 +9580,20 @@
 				"pify": "^2.3.0"
 			}
 		},
+		"node_modules/readable-stream": {
+			"version": "3.6.2",
+			"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
+			"integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
+			"license": "MIT",
+			"dependencies": {
+				"inherits": "^2.0.3",
+				"string_decoder": "^1.1.1",
+				"util-deprecate": "^1.0.1"
+			},
+			"engines": {
+				"node": ">= 6"
+			}
+		},
 		"node_modules/readdirp": {
 			"version": "3.6.0",
 			"resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz",
@@ -9385,6 +9876,26 @@
 				"queue-microtask": "^1.2.2"
 			}
 		},
+		"node_modules/safe-buffer": {
+			"version": "5.2.1",
+			"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
+			"integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT"
+		},
 		"node_modules/safer-buffer": {
 			"version": "2.1.2",
 			"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
@@ -9469,6 +9980,47 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/sharp": {
+			"version": "0.32.6",
+			"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
+			"integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
+			"hasInstallScript": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"color": "^4.2.3",
+				"detect-libc": "^2.0.2",
+				"node-addon-api": "^6.1.0",
+				"prebuild-install": "^7.1.1",
+				"semver": "^7.5.4",
+				"simple-get": "^4.0.1",
+				"tar-fs": "^3.0.4",
+				"tunnel-agent": "^0.6.0"
+			},
+			"engines": {
+				"node": ">=14.15.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/libvips"
+			}
+		},
+		"node_modules/sharp/node_modules/node-addon-api": {
+			"version": "6.1.0",
+			"resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
+			"integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==",
+			"license": "MIT"
+		},
+		"node_modules/sharp/node_modules/semver": {
+			"version": "7.8.0",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.8.0.tgz",
+			"integrity": "sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==",
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
 		"node_modules/shebang-command": {
 			"version": "2.0.0",
 			"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
@@ -9582,6 +10134,60 @@
 			"dev": true,
 			"license": "ISC"
 		},
+		"node_modules/simple-concat": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz",
+			"integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==",
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT"
+		},
+		"node_modules/simple-get": {
+			"version": "4.0.1",
+			"resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz",
+			"integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==",
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT",
+			"dependencies": {
+				"decompress-response": "^6.0.0",
+				"once": "^1.3.1",
+				"simple-concat": "^1.0.0"
+			}
+		},
+		"node_modules/simple-swizzle": {
+			"version": "0.2.4",
+			"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz",
+			"integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==",
+			"license": "MIT",
+			"dependencies": {
+				"is-arrayish": "^0.3.1"
+			}
+		},
 		"node_modules/simple-update-notifier": {
 			"version": "2.0.0",
 			"resolved": "https://registry.npmjs.org/simple-update-notifier/-/simple-update-notifier-2.0.0.tgz",
@@ -9723,6 +10329,26 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/streamx": {
+			"version": "2.25.0",
+			"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.25.0.tgz",
+			"integrity": "sha512-0nQuG6jf1w+wddNEEXCF4nTg3LtufWINB5eFEN+5TNZW7KWJp6x87+JFL43vaAUPyCfH1wID+mNVyW6OHtFamg==",
+			"license": "MIT",
+			"dependencies": {
+				"events-universal": "^1.0.0",
+				"fast-fifo": "^1.3.2",
+				"text-decoder": "^1.1.0"
+			}
+		},
+		"node_modules/string_decoder": {
+			"version": "1.3.0",
+			"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
+			"integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
+			"license": "MIT",
+			"dependencies": {
+				"safe-buffer": "~5.2.0"
+			}
+		},
 		"node_modules/string-argv": {
 			"version": "0.3.2",
 			"resolved": "https://registry.npmjs.org/string-argv/-/string-argv-0.3.2.tgz",
@@ -9803,6 +10429,15 @@
 				"node": ">=8"
 			}
 		},
+		"node_modules/strip-json-comments": {
+			"version": "2.0.1",
+			"resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz",
+			"integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==",
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
 		"node_modules/sucrase": {
 			"version": "3.35.1",
 			"resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.1.tgz",
@@ -9961,6 +10596,46 @@
 				"node": ">=18"
 			}
 		},
+		"node_modules/tar-fs": {
+			"version": "3.1.2",
+			"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.2.tgz",
+			"integrity": "sha512-QGxxTxxyleAdyM3kpFs14ymbYmNFrfY+pHj7Z8FgtbZ7w2//VAgLMac7sT6nRpIHjppXO2AwwEOg0bPFVRcmXw==",
+			"license": "MIT",
+			"dependencies": {
+				"pump": "^3.0.0",
+				"tar-stream": "^3.1.5"
+			},
+			"optionalDependencies": {
+				"bare-fs": "^4.0.1",
+				"bare-path": "^3.0.0"
+			}
+		},
+		"node_modules/tar-stream": {
+			"version": "3.2.0",
+			"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.2.0.tgz",
+			"integrity": "sha512-ojzvCvVaNp6aOTFmG7jaRD0meowIAuPc3cMMhSgKiVWws1GyHbGd/xvnyuRKcKlMpt3qvxx6r0hreCNITP9hIg==",
+			"license": "MIT",
+			"dependencies": {
+				"b4a": "^1.6.4",
+				"bare-fs": "^4.5.5",
+				"fast-fifo": "^1.2.0",
+				"streamx": "^2.15.0"
+			}
+		},
+		"node_modules/tar-stream/node_modules/b4a": {
+			"version": "1.8.1",
+			"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.8.1.tgz",
+			"integrity": "sha512-aiqre1Nr0B/6DgE2N5vwTc+2/oQZ4Wh1t4NznYY4E00y8LCt6NqdRv81so00oo27D8MVKTpUa/MwUUtBLXCoDw==",
+			"license": "Apache-2.0",
+			"peerDependencies": {
+				"react-native-b4a": "*"
+			},
+			"peerDependenciesMeta": {
+				"react-native-b4a": {
+					"optional": true
+				}
+			}
+		},
 		"node_modules/tar/node_modules/yallist": {
 			"version": "5.0.0",
 			"resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz",
@@ -9971,6 +10646,15 @@
 				"node": ">=18"
 			}
 		},
+		"node_modules/teex": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/teex/-/teex-1.0.1.tgz",
+			"integrity": "sha512-eYE6iEI62Ni1H8oIa7KlDU6uQBtqr4Eajni3wX7rpfXD8ysFx8z0+dri+KWEPWpBsxXfxu58x/0jvTVT1ekOSg==",
+			"license": "MIT",
+			"dependencies": {
+				"streamx": "^2.12.5"
+			}
+		},
 		"node_modules/temp": {
 			"version": "0.9.4",
 			"resolved": "https://registry.npmjs.org/temp/-/temp-0.9.4.tgz",
@@ -10061,6 +10745,29 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/text-decoder": {
+			"version": "1.2.7",
+			"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.7.tgz",
+			"integrity": "sha512-vlLytXkeP4xvEq2otHeJfSQIRyWxo/oZGEbXrtEEF9Hnmrdly59sUbzZ/QgyWuLYHctCHxFF4tRQZNQ9k60ExQ==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"b4a": "^1.6.4"
+			}
+		},
+		"node_modules/text-decoder/node_modules/b4a": {
+			"version": "1.8.1",
+			"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.8.1.tgz",
+			"integrity": "sha512-aiqre1Nr0B/6DgE2N5vwTc+2/oQZ4Wh1t4NznYY4E00y8LCt6NqdRv81so00oo27D8MVKTpUa/MwUUtBLXCoDw==",
+			"license": "Apache-2.0",
+			"peerDependencies": {
+				"react-native-b4a": "*"
+			},
+			"peerDependenciesMeta": {
+				"react-native-b4a": {
+					"optional": true
+				}
+			}
+		},
 		"node_modules/thenify": {
 			"version": "3.3.1",
 			"resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz",
@@ -10264,6 +10971,18 @@
 			"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
 			"license": "0BSD"
 		},
+		"node_modules/tunnel-agent": {
+			"version": "0.6.0",
+			"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
+			"integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"safe-buffer": "^5.0.1"
+			},
+			"engines": {
+				"node": "*"
+			}
+		},
 		"node_modules/type-fest": {
 			"version": "0.13.1",
 			"resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz",
@@ -10319,7 +11038,6 @@
 			"version": "6.21.0",
 			"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
 			"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
-			"dev": true,
 			"license": "MIT"
 		},
 		"node_modules/universalify": {
@@ -10831,7 +11549,6 @@
 			"version": "1.0.2",
 			"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
 			"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
-			"dev": true,
 			"license": "ISC"
 		},
 		"node_modules/ws": {
diff --git a/package.json b/package.json
index 2ccb0b32d..d7ba76362 100644
--- a/package.json
+++ b/package.json
@@ -52,6 +52,7 @@
 		"@uiw/color-convert": "^2.10.1",
 		"@uiw/react-color-block": "^2.10.1",
 		"@uiw/react-color-colorful": "^2.9.2",
+		"@xenova/transformers": "^2.17.2",
 		"class-variance-authority": "^0.7.1",
 		"clsx": "^2.1.1",
 		"dnd-timeline": "^2.4.0",
diff --git a/src/components/ui/select.tsx b/src/components/ui/select.tsx
index d151d164e..bdbf64e9a 100644
--- a/src/components/ui/select.tsx
+++ b/src/components/ui/select.tsx
@@ -82,7 +82,8 @@ const SelectContent = React.forwardRef<
 			<SelectPrimitive.Content
 				ref={ref}
 				className={cn(
-					"relative z-50 max-h-96 min-w-[8rem] overflow-hidden rounded-md border bg-popover text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",
+					/* Above Dialog (z-[10000]) and fullscreen overlays (e.g. z-[99999]) */
+					"relative z-[100000] max-h-96 min-w-[8rem] overflow-hidden rounded-md border bg-popover text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",
 					position === "popper" &&
 						"data-[side=bottom]:translate-y-1 data-[side=left]:-translate-x-1 data-[side=right]:translate-x-1",
 					className,
diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx
index fd2188818..86ee26ba0 100644
--- a/src/components/video-editor/VideoEditor.tsx
+++ b/src/components/video-editor/VideoEditor.tsx
@@ -1,8 +1,10 @@
 import type { Span } from "dnd-timeline";
 import { FolderOpen, Languages, Save, Video } from "lucide-react";
-import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { type CSSProperties, useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { MdClosedCaption } from "react-icons/md";
 import { Panel, PanelGroup, PanelResizeHandle } from "react-resizable-panels";
 import { toast } from "sonner";
+import { Button } from "@/components/ui/button";
 import {
 	Dialog,
 	DialogContent,
@@ -11,11 +13,28 @@ import {
 	DialogHeader,
 	DialogTitle,
 } from "@/components/ui/dialog";
+import { Label } from "@/components/ui/label";
+import {
+	Select,
+	SelectContent,
+	SelectItem,
+	SelectTrigger,
+	SelectValue,
+} from "@/components/ui/select";
 import { useI18n, useScopedT } from "@/contexts/I18nContext";
 import { useShortcuts } from "@/contexts/ShortcutsContext";
 import { INITIAL_EDITOR_STATE, useEditorHistory } from "@/hooks/useEditorHistory";
 import { type Locale } from "@/i18n/config";
 import { getAvailableLocales, getLocaleName } from "@/i18n/loader";
+import {
+	captionSegmentsToAnnotationRegions,
+	extractMono16kFromVideoUrl,
+	MAX_CAPTION_AUDIO_SEC,
+	reconcileAutoCaptionTimelineGaps,
+	shiftTrimRegionsMsForCaptionBuffer,
+	transcribeMono16kToSegments,
+	trimLeadingSilenceMono16k,
+} from "@/lib/captioning";
 import {
 	calculateOutputDimensions,
 	type ExportFormat,
@@ -77,6 +96,8 @@ import {
 } from "./types";
 import VideoPlayback, { VideoPlaybackRef } from "./VideoPlayback";
 
+const CAPTION_WORD_CHOICES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] as const;
+
 export default function VideoEditor() {
 	const {
 		state: editorState,
@@ -169,6 +190,11 @@ export default function VideoEditor() {
 
 	const nextAnnotationIdRef = useRef(1);
 	const nextAnnotationZIndexRef = useRef(1);
+	const isAutoCaptioningRef = useRef(false);
+	const [isAutoCaptioning, setIsAutoCaptioning] = useState(false);
+	const [showAutoCaptionsDialog, setShowAutoCaptionsDialog] = useState(false);
+	const [captionWordsMin, setCaptionWordsMin] = useState(2);
+	const [captionWordsMax, setCaptionWordsMax] = useState(7);
 	const exporterRef = useRef<VideoExporter | null>(null);
 
 	const annotationOnlyRegions = useMemo(
@@ -995,8 +1021,8 @@ export default function VideoEditor() {
 
 	const handleAnnotationSpanChange = useCallback(
 		(id: string, span: Span) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
+			pushState((prev) => {
+				const next = prev.annotationRegions.map((region) =>
 					region.id === id
 						? {
 								...region,
@@ -1004,8 +1030,11 @@ export default function VideoEditor() {
 								endMs: Math.round(span.end),
 							}
 						: region,
-				),
-			}));
+				);
+				return {
+					annotationRegions: reconcileAutoCaptionTimelineGaps(next),
+				};
+			});
 		},
 		[pushState],
 	);
@@ -1018,8 +1047,10 @@ export default function VideoEditor() {
 				const source = prev.annotationRegions.find((region) => region.id === id);
 				if (!source) return {};
 
+				const { annotationSource: _stripCaptionLink, ...sourceWithoutCaptionLink } = source;
+
 				const duplicate: AnnotationRegion = {
-					...source,
+					...sourceWithoutCaptionLink,
 					id: duplicateId,
 					zIndex: duplicateZIndex,
 					position: { x: source.position.x + 4, y: source.position.y + 4 },
@@ -1108,11 +1139,18 @@ export default function VideoEditor() {
 
 	const handleAnnotationStyleChange = useCallback(
 		(id: string, style: Partial<AnnotationRegion["style"]>) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
-					region.id === id ? { ...region, style: { ...region.style, ...style } } : region,
-				),
-			}));
+			pushState((prev) => {
+				const touched = prev.annotationRegions.find((r) => r.id === id);
+				const syncAutoCaptions = touched?.annotationSource === "auto-caption";
+				return {
+					annotationRegions: prev.annotationRegions.map((region) => {
+						if (syncAutoCaptions && region.annotationSource === "auto-caption") {
+							return { ...region, style: { ...region.style, ...style } };
+						}
+						return region.id === id ? { ...region, style: { ...region.style, ...style } } : region;
+					}),
+				};
+			});
 		},
 		[pushState],
 	);
@@ -1175,22 +1213,36 @@ export default function VideoEditor() {
 
 	const handleAnnotationPositionChange = useCallback(
 		(id: string, position: { x: number; y: number }) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
-					region.id === id ? { ...region, position } : region,
-				),
-			}));
+			pushState((prev) => {
+				const moved = prev.annotationRegions.find((r) => r.id === id);
+				const syncAutoCaptions = moved?.annotationSource === "auto-caption";
+				return {
+					annotationRegions: prev.annotationRegions.map((region) => {
+						if (syncAutoCaptions && region.annotationSource === "auto-caption") {
+							return { ...region, position };
+						}
+						return region.id === id ? { ...region, position } : region;
+					}),
+				};
+			});
 		},
 		[pushState],
 	);
 
 	const handleAnnotationSizeChange = useCallback(
 		(id: string, size: { width: number; height: number }) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
-					region.id === id ? { ...region, size } : region,
-				),
-			}));
+			pushState((prev) => {
+				const resized = prev.annotationRegions.find((r) => r.id === id);
+				const syncAutoCaptions = resized?.annotationSource === "auto-caption";
+				return {
+					annotationRegions: prev.annotationRegions.map((region) => {
+						if (syncAutoCaptions && region.annotationSource === "auto-caption") {
+							return { ...region, size };
+						}
+						return region.id === id ? { ...region, size } : region;
+					}),
+				};
+			});
 		},
 		[pushState],
 	);
@@ -1730,6 +1782,104 @@ export default function VideoEditor() {
 		}
 	}, []);
 
+	const generateAutoCaptions = useCallback(
+		async (minWords: number, maxWords: number) => {
+			if (!videoPath) {
+				toast.error(t("errors.noVideoLoaded"));
+				return;
+			}
+			if (isAutoCaptioningRef.current) {
+				toast.error(t("autoCaptions.busy"));
+				return;
+			}
+			const minW = Math.max(1, Math.min(minWords, maxWords));
+			const maxW = Math.max(minW, maxWords); // after clamping: 1 <= minW <= maxW, even if the arguments were swapped
+			// Flow: decode audio -> trim leading silence -> transcribe -> build annotation regions -> pushState.
+			isAutoCaptioningRef.current = true; // synchronous re-entry guard (read at the top of this callback)
+			setIsAutoCaptioning(true); // render-visible copy of the busy flag
+			const toastId = toast.loading(t("autoCaptions.generating"));
+			try {
+				const { samples, truncated, durationSec } = await extractMono16kFromVideoUrl(videoPath);
+				if (!Number.isFinite(durationSec) || durationSec <= 0 || samples.length < 800) {
+					toast.dismiss(toastId);
+					toast.error(t("autoCaptions.noAudio")); // NOTE(review): 800 samples is presumably ~50 ms at 16 kHz; confirm
+					return;
+				}
+				// The same minimum-length check is repeated after trimming, on the buffer that is actually transcribed.
+				const { samples: speechSamples, trimSec } = trimLeadingSilenceMono16k(samples);
+				if (speechSamples.length < 800) {
+					toast.dismiss(toastId);
+					toast.error(t("autoCaptions.noAudio"));
+					return;
+				}
+				// NOTE(review): presumably re-bases trimRegions onto the trimmed buffer's timeline; confirm in the helper.
+				const trimMs = Math.round(trimSec * 1000);
+				const trimRegionsForTranscribe = shiftTrimRegionsMsForCaptionBuffer(trimRegions, trimMs);
+
+				const { segments: segmentsRaw, granularity } = await transcribeMono16kToSegments(
+					speechSamples,
+					{
+						trimRegions: trimRegionsForTranscribe,
+						onStatus: (phase) => {
+							if (phase === "model") {
+								toast.loading(t("autoCaptions.loadingModel"), { id: toastId });
+							}
+						},
+					},
+				);
+				// Add trimSec back to every segment's start/end; the array is reused as-is when nothing was trimmed.
+				const segments =
+					trimSec > 0
+						? segmentsRaw.map((s) => ({
+								...s,
+								startSec: s.startSec + trimSec,
+								endSec: s.endSec + trimSec,
+							}))
+						: segmentsRaw;
+				// Region ids and z-indices continue from the current values of the shared annotation counters.
+				const { regions, nextNumericId, nextZIndex } = captionSegmentsToAnnotationRegions(
+					segments,
+					nextAnnotationIdRef.current,
+					nextAnnotationZIndexRef.current,
+					{
+						minWordsPerCaption: minW,
+						maxWordsPerCaption: maxW,
+						timestampGranularity: granularity,
+					},
+				);
+				// No regions produced: report it and return before editor state or the counters are touched.
+				if (regions.length === 0) {
+					toast.dismiss(toastId);
+					toast.info(t("autoCaptions.noneHeard"));
+					return;
+				}
+				// Append the new regions, then move the counters to the next values returned by the helper.
+				pushState((prev) => ({ annotationRegions: [...prev.annotationRegions, ...regions] }));
+				nextAnnotationIdRef.current = nextNumericId;
+				nextAnnotationZIndexRef.current = nextZIndex;
+
+				toast.dismiss(toastId);
+				if (truncated) {
+					toast.info(
+						t("autoCaptions.truncated", {
+							minutes: String(Math.round(MAX_CAPTION_AUDIO_SEC / 60)),
+						}),
+					);
+				}
+				toast.success(t("autoCaptions.done", { count: String(regions.length) }));
+			} catch (e) {
+				console.error(e);
+				toast.dismiss(toastId);
+				const detail = e instanceof Error ? e.message : String(e); // raw error text, shown as the toast description
+				toast.error(t("autoCaptions.failed"), { description: detail });
+			} finally {
+				isAutoCaptioningRef.current = false; // runs on success, on every early return inside try, and on error
+				setIsAutoCaptioning(false);
+			}
+		},
+		[videoPath, trimRegions, pushState, t],
+	);
+
 	if (loading) {
 		return (
 			<div className="flex items-center justify-center h-screen bg-background">
@@ -1759,7 +1909,7 @@ export default function VideoEditor() {
 			<Dialog open={showNewRecordingDialog} onOpenChange={setShowNewRecordingDialog}>
 				<DialogContent
 					className="sm:max-w-[425px]"
-					style={{ WebkitAppRegion: "no-drag" } as React.CSSProperties}
+					style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
 				>
 					<DialogHeader>
 						<DialogTitle>{t("newRecording.title")}</DialogTitle>
@@ -1784,13 +1934,92 @@ export default function VideoEditor() {
 				</DialogContent>
 			</Dialog>
 
+			<Dialog open={showAutoCaptionsDialog} onOpenChange={setShowAutoCaptionsDialog}>
+				<DialogContent
+					className="sm:max-w-md"
+					style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
+				>
+					<DialogHeader>
+						<DialogTitle>{t("autoCaptions.dialogTitle")}</DialogTitle>
+						<DialogDescription>{t("autoCaptions.dialogDescription")}</DialogDescription>
+					</DialogHeader>
+					<div className="grid gap-4 py-2">
+						<div className="grid gap-2">
+							<Label htmlFor="caption-min-words">{t("autoCaptions.minWords")}</Label>
+							<Select
+								value={String(captionWordsMin)}
+								onValueChange={(v) => {
+									const n = Number.parseInt(v, 10);
+									setCaptionWordsMin(n);
+									if (n > captionWordsMax) setCaptionWordsMax(n);
+								}}
+							>
+								<SelectTrigger id="caption-min-words" className="h-9">
+									<SelectValue />
+								</SelectTrigger>
+								<SelectContent>
+									{CAPTION_WORD_CHOICES.map((n) => (
+										<SelectItem key={`min-${n}`} value={String(n)}>
+											{t("autoCaptions.wordsCount", { count: String(n) })}
+										</SelectItem>
+									))}
+								</SelectContent>
+							</Select>
+						</div>
+						<div className="grid gap-2">
+							<Label htmlFor="caption-max-words">{t("autoCaptions.maxWords")}</Label>
+							<Select
+								value={String(captionWordsMax)}
+								onValueChange={(v) => {
+									const n = Number.parseInt(v, 10);
+									setCaptionWordsMax(n);
+									if (n < captionWordsMin) setCaptionWordsMin(n);
+								}}
+							>
+								<SelectTrigger id="caption-max-words" className="h-9">
+									<SelectValue />
+								</SelectTrigger>
+								<SelectContent>
+									{CAPTION_WORD_CHOICES.map((n) => (
+										<SelectItem key={`max-${n}`} value={String(n)}>
+											{t("autoCaptions.wordsCount", { count: String(n) })}
+										</SelectItem>
+									))}
+								</SelectContent>
+							</Select>
+						</div>
+					</div>
+					<DialogFooter className="gap-2 sm:gap-0">
+						<Button
+							type="button"
+							variant="outline"
+							onClick={() => setShowAutoCaptionsDialog(false)}
+							className="border-white/20 bg-transparent text-white hover:bg-white/10"
+						>
+							{t("autoCaptions.dialogCancel")}
+						</Button>
+						<Button
+							type="button"
+							disabled={isAutoCaptioning}
+							onClick={() => {
+								setShowAutoCaptionsDialog(false);
+								void generateAutoCaptions(captionWordsMin, captionWordsMax);
+							}}
+							className="bg-[#34B27B] text-white hover:bg-[#34B27B]/90"
+						>
+							{t("autoCaptions.generate")}
+						</Button>
+					</DialogFooter>
+				</DialogContent>
+			</Dialog>
+
 			<div
 				className="h-10 flex-shrink-0 bg-[#09090b]/80 backdrop-blur-md border-b border-white/5 flex items-center justify-between px-6 z-50"
-				style={{ WebkitAppRegion: "drag" } as React.CSSProperties}
+				style={{ WebkitAppRegion: "drag" } as CSSProperties}
 			>
 				<div
 					className="flex-1 flex items-center gap-1"
-					style={{ WebkitAppRegion: "no-drag" } as React.CSSProperties}
+					style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
 				>
 					<div
 						className={`flex items-center gap-1 px-2 py-1 rounded-md text-white/50 hover:text-white/90 hover:bg-white/10 transition-all duration-150 ${isMac ? "ml-14" : "ml-2"}`}
@@ -1833,6 +2062,28 @@ export default function VideoEditor() {
 						<Save size={14} />
 						{ts("project.save")}
 					</button>
+					<Button
+						type="button"
+						variant="ghost"
+						size="sm"
+						disabled={isAutoCaptioning || !videoPath}
+						onClick={() => {
+							if (!videoPath) {
+								toast.error(t("errors.noVideoLoaded"));
+								return;
+							}
+							if (isAutoCaptioningRef.current) {
+								toast.error(t("autoCaptions.busy"));
+								return;
+							}
+							setShowAutoCaptionsDialog(true);
+						}}
+						className="h-7 px-2 text-white/50 hover:text-white/90 hover:bg-white/10 text-[11px] font-medium gap-1"
+						style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
+					>
+						<MdClosedCaption className="size-3.5 shrink-0" aria-hidden />
+						{t("autoCaptions.button")}
+					</Button>
 				</div>
 			</div>
 
diff --git a/src/components/video-editor/projectPersistence.ts b/src/components/video-editor/projectPersistence.ts
index 8d134282d..b1ebd0b55 100644
--- a/src/components/video-editor/projectPersistence.ts
+++ b/src/components/video-editor/projectPersistence.ts
@@ -342,6 +342,8 @@ export function normalizeProjectEditor(editor: Partial<ProjectEditorState>): Pro
 						content: typeof region.content === "string" ? region.content : "",
 						textContent: typeof region.textContent === "string" ? region.textContent : undefined,
 						imageContent: typeof region.imageContent === "string" ? region.imageContent : undefined,
+						annotationSource:
+							region.annotationSource === "auto-caption" ? ("auto-caption" as const) : undefined,
 						position: {
 							x: clamp(
 								isFiniteNumber(region.position?.x)
diff --git a/src/components/video-editor/types.ts b/src/components/video-editor/types.ts
index f976efc8c..adf580942 100644
--- a/src/components/video-editor/types.ts
+++ b/src/components/video-editor/types.ts
@@ -248,6 +248,8 @@ export interface AnnotationRegion {
 	size: AnnotationSize;
 	style: AnnotationTextStyle;
 	zIndex: number;
+	/** Tags auto-caption regions: a position, size or style edit to one is applied to every region with this tag. */
+	annotationSource?: "auto-caption";
 	figureData?: FigureData;
 	blurData?: BlurData;
 }
diff --git a/src/i18n/locales/ar/editor.json b/src/i18n/locales/ar/editor.json
index a246f011a..d0f99ed60 100644
--- a/src/i18n/locales/ar/editor.json
+++ b/src/i18n/locales/ar/editor.json
@@ -41,5 +41,23 @@
 		"cameraDisconnected": "تم فصل كاميرا الويب.",
 		"cameraNotFound": "لم يتم العثور على كاميرا.",
 		"permissionDenied": "تم رفض إذن التسجيل. يرجى السماح بتسجيل الشاشة."
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/i18n/locales/en/editor.json b/src/i18n/locales/en/editor.json
index 13e2e1397..ce62aeddc 100644
--- a/src/i18n/locales/en/editor.json
+++ b/src/i18n/locales/en/editor.json
@@ -41,5 +41,23 @@
 		"cameraDisconnected": "Webcam disconnected.",
 		"cameraNotFound": "Camera not found.",
 		"permissionDenied": "Recording permission denied. Please allow screen recording."
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/i18n/locales/es/editor.json b/src/i18n/locales/es/editor.json
index 8f6ad13e7..64faff9e7 100644
--- a/src/i18n/locales/es/editor.json
+++ b/src/i18n/locales/es/editor.json
@@ -41,5 +41,23 @@
 		"description": "Tu sesión actual ha sido guardada.",
 		"cancel": "Cancelar",
 		"confirm": "Confirmar"
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/i18n/locales/fr/editor.json b/src/i18n/locales/fr/editor.json
index 6380c6b3d..4137a7035 100644
--- a/src/i18n/locales/fr/editor.json
+++ b/src/i18n/locales/fr/editor.json
@@ -41,5 +41,23 @@
 		"cameraNotFound": "Caméra introuvable.",
 		"permissionDenied": "Permission d'enregistrement refusée. Veuillez autoriser l'enregistrement d'écran."
 	},
-	"loadingVideo": "Chargement de la vidéo..."
+	"loadingVideo": "Chargement de la vidéo...",
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
+	}
 }
diff --git a/src/i18n/locales/ja-JP/editor.json b/src/i18n/locales/ja-JP/editor.json
index 051335f30..e488dadc8 100644
--- a/src/i18n/locales/ja-JP/editor.json
+++ b/src/i18n/locales/ja-JP/editor.json
@@ -41,5 +41,23 @@
 		"permissionDenied": "録画の権限が拒否されました。画面録画を許可してください。",
 		"cameraDisconnected": "ウェブカメラが切断されました。",
 		"cameraNotFound": "カメラが見つかりません。"
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/i18n/locales/ko-KR/editor.json b/src/i18n/locales/ko-KR/editor.json
index ce1624476..2c57b9c13 100644
--- a/src/i18n/locales/ko-KR/editor.json
+++ b/src/i18n/locales/ko-KR/editor.json
@@ -41,5 +41,23 @@
 		"permissionDenied": "녹화 권한이 거부되었습니다. 화면 녹화를 허용해 주세요.",
 		"cameraDisconnected": "웹캠 연결이 끊어졌습니다.",
 		"cameraNotFound": "카메라를 찾을 수 없습니다."
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/i18n/locales/tr/editor.json b/src/i18n/locales/tr/editor.json
index 0aece8aec..e5946a9a0 100644
--- a/src/i18n/locales/tr/editor.json
+++ b/src/i18n/locales/tr/editor.json
@@ -41,5 +41,23 @@
 		"description": "Mevcut oturumunuz kaydedildi.",
 		"cancel": "İptal",
 		"confirm": "Onayla"
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/i18n/locales/zh-CN/editor.json b/src/i18n/locales/zh-CN/editor.json
index f6c02d4c3..ca48fa6df 100644
--- a/src/i18n/locales/zh-CN/editor.json
+++ b/src/i18n/locales/zh-CN/editor.json
@@ -41,5 +41,23 @@
 		"cameraDisconnected": "摄像头已断开连接。",
 		"cameraNotFound": "未找到摄像头。",
 		"permissionDenied": "录屏权限被拒绝。请允许屏幕录制。"
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/i18n/locales/zh-TW/editor.json b/src/i18n/locales/zh-TW/editor.json
index 21f3ba6f9..2ba567499 100644
--- a/src/i18n/locales/zh-TW/editor.json
+++ b/src/i18n/locales/zh-TW/editor.json
@@ -41,5 +41,23 @@
 		"permissionDenied": "錄影權限被拒絕。請允許螢幕錄製。",
 		"cameraDisconnected": "網路攝影機已中斷連線。",
 		"cameraNotFound": "找不到攝影機。"
+	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
 	}
 }
diff --git a/src/lib/captioning/annotationsFromCaptions.ts b/src/lib/captioning/annotationsFromCaptions.ts
new file mode 100644
index 000000000..db26669a3
--- /dev/null
+++ b/src/lib/captioning/annotationsFromCaptions.ts
@@ -0,0 +1,499 @@
+import type { AnnotationRegion, AnnotationTextStyle } from "@/components/video-editor/types";
+
+import type { CaptionSegment } from "./transcribe";
+
+/** Wide lower-third bar; `position.x` is top-left as % of container, so center with (100 − width) / 2. */
+const CAPTION_WIDTH = 92;
+const CAPTION_HEIGHT = 12;
+const CAPTION_BOTTOM_MARGIN = 4;
+
+const CAPTION_POSITION = {
+	x: (100 - CAPTION_WIDTH) / 2,
+	y: 100 - CAPTION_HEIGHT - CAPTION_BOTTOM_MARGIN,
+};
+
+const CAPTION_SIZE = { width: CAPTION_WIDTH, height: CAPTION_HEIGHT };
+
+const CAPTION_STYLE: AnnotationTextStyle = {
+	color: "#ffffff",
+	backgroundColor: "rgba(255, 255, 255, 0)",
+	fontSize: 24,
+	fontFamily: "Inter",
+	fontWeight: "normal",
+	fontStyle: "normal",
+	textDecoration: "none",
+	textAlign: "center",
+};
+
+/**
+ * Nudge caption **starts** earlier (seconds). Whisper onsets are often slightly late vs. what you
+ * hear; do **not** apply the same offset to ends — that pulls lines off-screen too early.
+ */
+const AUTO_CAPTION_START_BIAS_SEC = -0.2;
+
+/**
+ * Extra time held after Whisper’s segment **end** (seconds). Model end times are often early vs.
+ * trailing vowels / room tone; this is separate from `AUTO_CAPTION_START_BIAS_SEC`.
+ */
+const AUTO_CAPTION_END_HOLD_SEC = 0.12;
+
+/** First phrases often sit a bit early in the model; delay only the first two timeline lines (seconds). */
+const FIRST_TWO_CAPTION_DELAY_SEC = 0.32;
+
+/** Inside one Whisper phrase, sub-lines can be shorter (do not steal time from neighbors). */
+const WORD_SPLIT_MIN_SPAN_SEC = 0.02;
+
+/** Brief linger after the last word in a line (seconds); trimmed if it would overlap the next line. */
+const CAPTION_LINE_END_TAIL_SEC = 0.12;
+
+/**
+ * Minimum time between consecutive caption regions on the timeline (seconds). Keeps a visible gap
+ * so blocks do not read as one clip; kept small so we do not erase natural short pauses between phrases.
+ */
+const MIN_CAPTION_TIMELINE_GAP_SEC = 0.024;
+
+/** Same text again with almost no gap or overlap — common Whisper / chunk artifact. */
+const DEDUPE_SAME_TEXT_MAX_GAP_SEC = 0.55;
+
+/**
+ * Same caption content re-emerging shortly after the last time that text appeared (stride /
+ * decoding loops). Wider than `DEDUPE_SAME_TEXT_MAX_GAP_SEC` so non-adjacent duplicates still
+ * collapse after grouping.
+ */
+const SAME_CONTENT_ECHO_MAX_GAP_SEC = 1.15;
+
+/** Dedupe key: trimmed, whitespace-collapsed, straight quotes, lowercase, no trailing punctuation. */
+function normalizeCaptionKey(text: string): string {
+	const collapsed = text.trim().replace(/\s+/g, " ");
+	const straightQuotes = collapsed
+		.replace(/[\u2018\u2019]/g, "'")
+		.replace(/[\u201C\u201D]/g, '"');
+	const lowered = straightQuotes.toLowerCase();
+	return lowered.replace(/[.!?,;:]+$/g, "");
+}
+
+/** Folds a line into the latest same-text line (adjacent or not) when it starts within `SAME_CONTENT_ECHO_MAX_GAP_SEC` of that line's end. */
+function collapseSameContentEchoes(segments: CaptionSegment[]): CaptionSegment[] {
+	const sorted = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	const out: CaptionSegment[] = [];
+	const lastIndexByKey = new Map<string, number>();
+
+	for (const seg of sorted) {
+		const key = normalizeCaptionKey(seg.text);
+		const hit = lastIndexByKey.get(key);
+		if (hit !== undefined) {
+			const prev = out[hit]!;
+			if (seg.startSec < prev.endSec + SAME_CONTENT_ECHO_MAX_GAP_SEC) {
+				prev.startSec = Math.min(prev.startSec, seg.startSec);
+				prev.endSec = Math.max(prev.endSec, seg.endSec);
+				continue;
+			}
+		}
+		out.push({
+			startSec: seg.startSec,
+			endSec: seg.endSec,
+			text: seg.text.trim(),
+		});
+		lastIndexByKey.set(key, out.length - 1);
+	}
+	return out;
+}
+
+/**
+ * Only merge segments that are almost back-to-back (Whisper often splits mid-phrase with a tiny gap).
+ * Wider gaps are usually silence or missed audio — merging those stretches word timing across dead air.
+ */
+const AUTO_CAPTION_MERGE_OPTIONS = {
+	maxGapSec: 0.16,
+	maxChars: 500,
+	maxBlockDurationSec: 5,
+} as const;
+
+/**
+ * Collapse consecutive lines with the same normalized text when they overlap or the next one starts
+ * within `DEDUPE_SAME_TEXT_MAX_GAP_SEC` of the previous end; the first text and the union span are kept.
+ */
+function dedupeAdjacentCaptionRepeats(segments: CaptionSegment[]): CaptionSegment[] {
+	const sorted = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	const out: CaptionSegment[] = [];
+	for (const seg of sorted) {
+		const t = seg.text.trim();
+		const prev = out[out.length - 1];
+		if (prev && normalizeCaptionKey(prev.text) === normalizeCaptionKey(t)) {
+			const overlap = prev.endSec - seg.startSec;
+			const gap = seg.startSec - prev.endSec;
+			if (overlap > 0.015 || gap < DEDUPE_SAME_TEXT_MAX_GAP_SEC) {
+				prev.startSec = Math.min(prev.startSec, seg.startSec);
+				prev.endSec = Math.max(prev.endSec, seg.endSec);
+				continue;
+			}
+		}
+		out.push({ startSec: seg.startSec, endSec: seg.endSec, text: t });
+	}
+	return out;
+}
+
+/**
+ * Apply start bias + end hold (plus `FIRST_TWO_CAPTION_DELAY_SEC` on the first two sorted lines),
+ * then trim real overlaps and enforce `MIN_CAPTION_TIMELINE_GAP_SEC`. Both passes only shorten the
+ * previous line's end; starts are never moved by them, so gaps are not collapsed.
+ */
+function finalizeCaptionSegmentsForPlayback(segments: CaptionSegment[]): CaptionSegment[] {
+	const OVERLAP_TRIM_SEC = 0.002;
+
+	const sortedRaw = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+
+	const a = sortedRaw.map((seg, i) => {
+		const earlyHold = i < 2 ? FIRST_TWO_CAPTION_DELAY_SEC : 0;
+		let s = seg.startSec + AUTO_CAPTION_START_BIAS_SEC + earlyHold;
+		let e = seg.endSec + AUTO_CAPTION_END_HOLD_SEC + earlyHold;
+		s = Math.max(0, s);
+		if (e <= s) e = s + 0.02;
+		return { startSec: s, endSec: e, text: seg.text.trim() };
+	});
+
+	for (let i = 1; i < a.length; i++) {
+		if (a[i].startSec < a[i - 1].endSec - OVERLAP_TRIM_SEC) {
+			a[i - 1].endSec = Math.max(a[i - 1].startSec + 1e-4, a[i].startSec);
+		}
+	}
+
+	// Leave at least MIN_CAPTION_TIMELINE_GAP_SEC between lines (shorten previous end only).
+	for (let i = 1; i < a.length; i++) {
+		const needPrevEnd = a[i]!.startSec - MIN_CAPTION_TIMELINE_GAP_SEC;
+		if (a[i - 1]!.endSec > needPrevEnd) {
+			a[i - 1]!.endSec = Math.max(a[i - 1]!.startSec + WORD_SPLIT_MIN_SPAN_SEC, needPrevEnd);
+		}
+	}
+
+	return a;
+}
+
+/** Default min gap between auto-caption blocks on the timeline (ms); matches `MIN_CAPTION_TIMELINE_GAP_SEC`. */
+export const DEFAULT_AUTO_CAPTION_MIN_GAP_MS = Math.round(MIN_CAPTION_TIMELINE_GAP_SEC * 1000);
+
+/**
+ * Enforces a minimum gap between consecutive `auto-caption` regions (ordered by start time). The
+ * previous region's end is shortened when it keeps at least 1 ms; otherwise the following region
+ * is shifted later with its duration preserved. Other regions and the input order are untouched.
+ */
+export function reconcileAutoCaptionTimelineGaps(
+	regions: AnnotationRegion[],
+	minGapMs: number = DEFAULT_AUTO_CAPTION_MIN_GAP_MS,
+): AnnotationRegion[] {
+	const gap = Math.max(0, Math.round(minGapMs));
+	if (regions.length === 0 || gap === 0) return regions;
+
+	const autoCandidates = regions.filter((r) => r.annotationSource === "auto-caption");
+	if (autoCandidates.length <= 1) return regions;
+
+	const sorted = [...autoCandidates].sort((a, b) => a.startMs - b.startMs || a.endMs - b.endMs);
+	const fixed: AnnotationRegion[] = [];
+	let prev = { ...sorted[0]! };
+	fixed.push(prev);
+
+	for (let i = 1; i < sorted.length; i++) {
+		let cur = { ...sorted[i]! };
+		const minStart = prev.endMs + gap;
+
+		if (cur.startMs < minStart) {
+			const newPrevEnd = cur.startMs - gap;
+			if (newPrevEnd >= prev.startMs + 1) {
+				prev = { ...prev, endMs: newPrevEnd };
+				fixed[fixed.length - 1] = prev;
+			} else {
+				const dur = Math.max(1, cur.endMs - cur.startMs);
+				cur = { ...cur, startMs: minStart, endMs: minStart + dur };
+			}
+		}
+
+		fixed.push(cur);
+		prev = cur;
+	}
+
+	const fixedById = new Map(fixed.map((r) => [r.id, r]));
+	return regions.map((r) => fixedById.get(r.id) ?? r);
+}
+
+/** Joins phrases whose gap, merged text length and merged span stay within the limits (defaults 1.35 s, 320 chars, 12 s). */
+export function mergeAdjacentCaptionSegments(
+	segments: CaptionSegment[],
+	options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+): CaptionSegment[] {
+	const maxGapSec = options?.maxGapSec ?? 1.35;
+	const maxChars = options?.maxChars ?? 320;
+	const maxBlockDurationSec = options?.maxBlockDurationSec ?? 12;
+
+	const sorted = [...segments].sort((a, b) => a.startSec - b.startSec);
+	const out: CaptionSegment[] = [];
+
+	for (const seg of sorted) {
+		const text = seg.text.trim();
+		if (!text) continue;
+
+		const prev = out[out.length - 1];
+		if (!prev) {
+			out.push({ startSec: seg.startSec, endSec: seg.endSec, text });
+			continue;
+		}
+
+		const gap = seg.startSec - prev.endSec;
+		const mergedText = `${prev.text} ${text}`.trim();
+		const mergedEnd = Math.max(prev.endSec, seg.endSec);
+		const wouldSpan = mergedEnd - prev.startSec;
+		if (gap <= maxGapSec && mergedText.length <= maxChars && wouldSpan <= maxBlockDurationSec) {
+			prev.endSec = mergedEnd;
+			prev.text = mergedText;
+		} else {
+			out.push({ startSec: seg.startSec, endSec: seg.endSec, text });
+		}
+	}
+
+	return out;
+}
+
+export interface CaptionSegmentLayoutOptions {
+	/** Lower bound on words per on-screen caption (default 2). */
+	minWordsPerCaption?: number;
+	/** Upper bound on words per on-screen caption (default 7). */
+	maxWordsPerCaption?: number;
+	/**
+	 * `word`: each `CaptionSegment` is a single token with Whisper word timestamps (default).
+	 * `phrase`: merged phrase spans; use proportional line splitting inside each span.
+	 */
+	timestampGranularity?: "word" | "phrase";
+}
+
+/**
+ * Half-open `[from, to)` line ranges, normally ≤ `maxWords` words; a tail under `minWords` is rebalanced or absorbed.
+ */
+function computeCaptionLineIndexRanges(
+	wordCount: number,
+	minWords: number,
+	maxWords: number,
+): Array<{ from: number; to: number }> {
+	const minW = Math.max(1, Math.min(Math.floor(minWords), Math.floor(maxWords)));
+	const maxW = Math.max(minW, Math.floor(maxWords));
+	const sliceRanges: Array<{ from: number; to: number }> = [];
+	let i = 0;
+	while (i < wordCount) {
+		const remaining = wordCount - i;
+		if (remaining <= maxW) {
+			if (sliceRanges.length > 0 && remaining < minW) {
+				sliceRanges[sliceRanges.length - 1]!.to = wordCount;
+			} else {
+				sliceRanges.push({ from: i, to: wordCount });
+			}
+			break;
+		}
+
+		let take = maxW;
+		const after = remaining - take;
+		if (after > 0 && after < minW) {
+			take = remaining - minW; // always < maxW here, so no upper clamp is needed
+			if (take < minW) {
+				sliceRanges.push({ from: i, to: wordCount });
+				break;
+			}
+		}
+		sliceRanges.push({ from: i, to: i + take });
+		i += take;
+	}
+	return sliceRanges;
+}
+
+/**
+ * Groups per-word segments into on-screen lines using each token's own Whisper timestamps (no
+ * proportional stretching); each line lingers `CAPTION_LINE_END_TAIL_SEC`, trimmed before the next line.
+ */
+export function groupTimedCaptionWordsIntoLines(
+	segments: CaptionSegment[],
+	minWords: number,
+	maxWords: number,
+): CaptionSegment[] {
+	const words = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	if (words.length === 0) return [];
+
+	const minW = Math.max(1, Math.min(Math.floor(minWords), Math.floor(maxWords)));
+	const maxW = Math.max(minW, Math.floor(maxWords));
+	const ranges = computeCaptionLineIndexRanges(words.length, minW, maxW);
+	const out: CaptionSegment[] = [];
+	for (const { from, to } of ranges) {
+		const slice = words.slice(from, to);
+		const s = slice[0]!.startSec;
+		const rawEnd = slice[slice.length - 1]!.endSec;
+		const e = Math.max(s + WORD_SPLIT_MIN_SPAN_SEC, rawEnd + CAPTION_LINE_END_TAIL_SEC);
+		out.push({
+			startSec: s,
+			endSec: e,
+			text: slice.map((w) => w.text.trim()).join(" "),
+		});
+	}
+	for (let i = 0; i < out.length - 1; i++) {
+		if (out[i]!.endSec > out[i + 1]!.startSec + 1e-3) {
+			out[i]!.endSec = Math.max(
+				out[i]!.startSec + WORD_SPLIT_MIN_SPAN_SEC,
+				out[i + 1]!.startSec - 1e-4,
+			);
+		}
+	}
+	return out;
+}
+
+/**
+ * Splits each merged span holding more than `maxWords` words into captions of about
+ * `minWords`–`maxWords` words, interpolating times by character weight; shorter spans pass through.
+ */
+export function splitMergedCaptionsByWordBounds(
+	merged: CaptionSegment[],
+	minWords: number,
+	maxWords: number,
+): CaptionSegment[] {
+	const minW = Math.max(1, Math.min(Math.floor(minWords), Math.floor(maxWords)));
+	const maxW = Math.max(minW, Math.floor(maxWords));
+	const out: CaptionSegment[] = [];
+
+	for (const seg of merged) {
+		const words = seg.text.trim().split(/\s+/).filter(Boolean);
+		if (words.length === 0) continue;
+
+		if (words.length <= maxW) {
+			out.push({
+				startSec: seg.startSec,
+				endSec: seg.endSec,
+				text: words.join(" "),
+			});
+			continue;
+		}
+
+		out.push(...splitOneSegmentByWordBounds(seg.startSec, seg.endSec, words, minW, maxW));
+	}
+
+	return out;
+}
+
+/**
+ * Splits one span into `minWords`–`maxWords` word lines, giving each line a share of the span
+ * proportional to its character count (every word weighs at least 1).
+ */
+function splitOneSegmentByWordBounds(
+	startSec: number,
+	endSec: number,
+	words: string[],
+	minWords: number,
+	maxWords: number,
+): CaptionSegment[] {
+	const ranges = computeCaptionLineIndexRanges(words.length, minWords, maxWords);
+	const spanSec = Math.max(endSec - startSec, 0.05);
+	const charWeights = words.map((word) => Math.max(1, word.length));
+	const weightTotal = charWeights.reduce((sum, w) => sum + w, 0);
+	// Sum of weights over [from, to); weights are integers, so partial sums are exact.
+	const weightOf = (from: number, to: number): number =>
+		charWeights.slice(from, to).reduce((sum, w) => sum + w, 0);
+
+	const lines: CaptionSegment[] = [];
+	let cursor = startSec;
+	for (const { from, to } of ranges) {
+		const before = weightOf(0, from);
+		const lineStart = Math.max(startSec + (before / weightTotal) * spanSec, cursor);
+		const proportionalEnd = startSec + ((before + weightOf(from, to)) / weightTotal) * spanSec;
+		let lineEnd = Math.min(Math.max(lineStart + WORD_SPLIT_MIN_SPAN_SEC, proportionalEnd), endSec);
+		if (lineEnd <= lineStart) {
+			lineEnd = Math.min(endSec, lineStart + WORD_SPLIT_MIN_SPAN_SEC);
+		}
+		cursor = lineEnd;
+		lines.push({
+			startSec: lineStart,
+			endSec: lineEnd,
+			text: words.slice(from, to).join(" "),
+		});
+	}
+
+	// Pin the final line to the span end, then trim any line that still runs into its successor.
+	const last = lines[lines.length - 1];
+	if (last) {
+		last.endSec = endSec;
+		for (let i = 0; i + 1 < lines.length; i++) {
+			if (lines[i].endSec > lines[i + 1].startSec + 0.002) {
+				lines[i].endSec = Math.max(lines[i].startSec + 1e-4, lines[i + 1].startSec);
+			}
+		}
+	}
+	return lines;
+}
+
+export function captionSegmentsToAnnotationRegions(
+	segments: CaptionSegment[],
+	startNumericId: number,
+	startZIndex: number,
+	layout?: CaptionSegmentLayoutOptions,
+): { regions: AnnotationRegion[]; nextNumericId: number; nextZIndex: number } {
+	// Only adjacent-dedupe raw word tokens: echo-collapsing would merge repeated words ("I … I")
+	// under one key. NOTE(review): adjacent dedupe can still fold true doubles ("no no") — confirm.
+	const dedupedIn = dedupeAdjacentCaptionRepeats(segments);
+	const minW = layout?.minWordsPerCaption ?? 2;
+	const maxW = layout?.maxWordsPerCaption ?? 7;
+	const granularity = layout?.timestampGranularity ?? "word";
+
+	const grouped =
+		granularity === "phrase"
+			? splitMergedCaptionsByWordBounds(
+					mergeAdjacentCaptionSegments(dedupedIn, AUTO_CAPTION_MERGE_OPTIONS),
+					minW,
+					maxW,
+				)
+			: groupTimedCaptionWordsIntoLines(dedupedIn, minW, maxW);
+
+	const dedupedOut = dedupeAdjacentCaptionRepeats(grouped);
+	const rinsedOut = collapseSameContentEchoes(dedupedOut);
+	const finalized = finalizeCaptionSegmentsForPlayback(rinsedOut);
+
+	let nid = startNumericId;
+	let z = startZIndex;
+	const regions: AnnotationRegion[] = [];
+
+	for (const seg of finalized) {
+		const startMs = Math.round(seg.startSec * 1000);
+		const endMs = Math.max(Math.round(seg.endSec * 1000), startMs + 1);
+		regions.push({
+			id: `annotation-${nid++}`,
+			startMs,
+			endMs,
+			type: "text",
+			content: seg.text,
+			annotationSource: "auto-caption",
+			position: { ...CAPTION_POSITION },
+			size: { ...CAPTION_SIZE },
+			style: { ...CAPTION_STYLE },
+			zIndex: z++,
+		});
+	}
+
+	return {
+		regions: reconcileAutoCaptionTimelineGaps(regions),
+		nextNumericId: nid,
+		nextZIndex: z,
+	};
+}
+
+/** Largest N among ids shaped `annotation-N`, or 0 when no id matches. */
+export function maxAnnotationNumericId(regions: AnnotationRegion[]): number {
+	return regions.reduce((highest, region) => {
+		// Ids in any other shape are ignored.
+		const match = /^annotation-(\d+)$/.exec(region.id);
+		return match ? Math.max(highest, Number.parseInt(match[1], 10)) : highest;
+	}, 0);
+}
+
+/** Highest `zIndex` in `regions`, or 0 when empty; folds rather than spreading so huge lists cannot hit the argument limit. */
+export function maxAnnotationZIndex(regions: AnnotationRegion[]): number {
+	return regions.reduce((max, r) => Math.max(max, r.zIndex), regions[0]?.zIndex ?? 0);
+}
diff --git a/src/lib/captioning/captionConstants.ts b/src/lib/captioning/captionConstants.ts
new file mode 100644
index 000000000..1bacb7cc7
--- /dev/null
+++ b/src/lib/captioning/captionConstants.ts
@@ -0,0 +1,2 @@
+/** Max audio length for auto-captions (decode + transcribe); keep demuxer read aligned with this. */
+export const MAX_CAPTION_AUDIO_SEC = 4 * 60 * 60;
diff --git a/src/lib/captioning/extractMono16k.ts b/src/lib/captioning/extractMono16k.ts
new file mode 100644
index 000000000..9e932ea3d
--- /dev/null
+++ b/src/lib/captioning/extractMono16k.ts
@@ -0,0 +1,159 @@
+import { MAX_CAPTION_AUDIO_SEC } from "./captionConstants";
+import { extractMonoPcmViaWebDemuxer } from "./extractMono16kWebDemuxer";
+
+export { MAX_CAPTION_AUDIO_SEC };
+
+const FETCH_TIMEOUT_MS = 120_000;
+
+/** `fetch` that aborts after `FETCH_TIMEOUT_MS` or as soon as the caller's `signal` aborts. */
+async function fetchWithTimeout(url: string, signal?: AbortSignal): Promise<Response> {
+	const controller = new AbortController();
+	const abort = () => controller.abort();
+	const timeoutId = window.setTimeout(abort, FETCH_TIMEOUT_MS);
+	if (signal?.aborted) abort();
+	else signal?.addEventListener("abort", abort, { once: true });
+	try {
+		// The timer is cleared once `fetch` settles, so reading the body is not covered by the deadline.
+		return await fetch(url, { signal: controller.signal });
+	} finally {
+		window.clearTimeout(timeoutId);
+		signal?.removeEventListener("abort", abort);
+	}
+}
+
+/**
+ * Loads the editor video as a `File`, the same way as `StreamingVideoDecoder`: anything that is
+ * not an http(s)/blob/data URL goes through Electron `readBinaryFile` when it is available
+ * (fetch(file://) is unreliable in the renderer); every other source is fetched with a timeout.
+ */
+async function loadSourceVideoFile(videoUrl: string, signal?: AbortSignal): Promise<File> {
+	const isRemoteUrl = /^(https?:|blob:|data:)/i.test(videoUrl);
+
+	if (!isRemoteUrl && window.electronAPI?.readBinaryFile) {
+		const result = await window.electronAPI.readBinaryFile(videoUrl);
+		if (!result.success || !result.data) {
+			throw new Error(result.message || result.error || "Failed to read source video");
+		}
+		const filename = (result.path || videoUrl).split(/[\\/]/).pop() || "video";
+		return new File([result.data], filename, { type: "video/webm" });
+	}
+
+	const response = await fetchWithTimeout(videoUrl, signal);
+	if (!response.ok) {
+		throw new Error(`Failed to load video for captions: ${response.status} ${response.statusText}`);
+	}
+	const blob = await response.blob();
+	if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	const filename = videoUrl.split("/").pop() || "video";
+	return new File([blob], filename, { type: blob.type || "video/webm" });
+}
+
+/** Mixes all channels to mono by averaging; channel data is fetched once, not once per sample. */
+function mixToMono(audioBuffer: AudioBuffer): Float32Array {
+	const { length, numberOfChannels } = audioBuffer;
+	const out = new Float32Array(length);
+	if (numberOfChannels === 0) return out;
+	const channels = Array.from({ length: numberOfChannels }, (_, c) => audioBuffer.getChannelData(c));
+	for (let i = 0; i < length; i++) {
+		let sum = 0;
+		for (const channel of channels) sum += channel[i];
+		out[i] = sum / numberOfChannels;
+	}
+	return out;
+}
+
+/** Resamples mono PCM through an `OfflineAudioContext` render; returns `mono` itself when the rates already match. */
+async function resampleMono(
+	mono: Float32Array,
+	fromRate: number,
+	toRate: number,
+	signal?: AbortSignal,
+): Promise<Float32Array> {
+	if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	if (fromRate === toRate) return mono;
+	const targetLength = Math.max(1, Math.ceil((mono.length / fromRate) * toRate));
+	const context = new OfflineAudioContext(1, targetLength, toRate);
+	const input = context.createBuffer(1, mono.length, fromRate);
+	input.copyToChannel(Float32Array.from(mono), 0);
+	const node = context.createBufferSource();
+	node.buffer = input;
+	node.connect(context.destination);
+	node.start(0);
+	const output = (await context.startRendering()).getChannelData(0);
+	if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	return output.slice();
+}
+
+/**
+ * Caps `mono` at `MAX_CAPTION_AUDIO_SEC` worth of samples, then resamples it to 16 kHz.
+ * `truncated` reports whether the cap applied; `durationSec` is passed through unchanged.
+ */
+async function truncateAndResampleTo16k(
+	mono: Float32Array,
+	fromRate: number,
+	durationSec: number,
+	signal?: AbortSignal,
+): Promise<{ samples: Float32Array; truncated: boolean; durationSec: number }> {
+	const truncated = durationSec > MAX_CAPTION_AUDIO_SEC;
+	const work = truncated
+		? mono.subarray(0, Math.min(mono.length, Math.floor(MAX_CAPTION_AUDIO_SEC * fromRate)))
+		: mono;
+	const samples = await resampleMono(work, fromRate, 16_000, signal);
+	return { samples, truncated, durationSec };
+}
+
+/**
+ * Decode the video's audio track to mono 16 kHz float samples (Whisper input). Prefers
+ * `decodeAudioData`, falling back to the export's web-demuxer + AudioDecoder path when it fails or
+ * yields almost no PCM; an abort during that first attempt rejects with `AbortError` instead.
+ */
+export async function extractMono16kFromVideoUrl(
+	videoUrl: string,
+	options?: { signal?: AbortSignal },
+): Promise<{ samples: Float32Array; truncated: boolean; durationSec: number }> {
+	const file = await loadSourceVideoFile(videoUrl, options?.signal);
+
+	/** When this returns null, use web-demuxer + AudioDecoder (same as export). */
+	const tryDecodeAudioDataPath = async (): Promise<{
+		samples: Float32Array;
+		truncated: boolean;
+		durationSec: number;
+	} | null> => {
+		const audioContext = new AudioContext();
+		try {
+			const ab = await file.arrayBuffer();
+			if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+			// `arrayBuffer()` returns a fresh copy, so no defensive `slice(0)` is needed before it is detached.
+			const audioBuffer = await audioContext.decodeAudioData(ab);
+			if (
+				audioBuffer.numberOfChannels === 0 ||
+				audioBuffer.length === 0 ||
+				!Number.isFinite(audioBuffer.duration) ||
+				audioBuffer.duration <= 0
+			) {
+				return null;
+			}
+			const durationSec = audioBuffer.duration;
+			const mono = mixToMono(audioBuffer);
+			const fromRate = audioBuffer.sampleRate;
+			const out = await truncateAndResampleTo16k(mono, fromRate, durationSec, options?.signal);
+			// decodeAudioData can resolve for some WebM/Matroska inputs with almost no PCM; use the fallback.
+			if (out.samples.length < 800) {
+				return null;
+			}
+			return out;
+		} catch {
+			// Cancellation must surface instead of silently starting the slower demuxer fallback.
+			if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+			return null;
+		} finally {
+			await audioContext.close().catch(() => undefined);
+		}
+	};
+
+	const primary = await tryDecodeAudioDataPath();
+	if (primary) return primary;
+
+	const pcm = await extractMonoPcmViaWebDemuxer(file, options?.signal);
+	return truncateAndResampleTo16k(pcm.mono, pcm.sampleRate, pcm.durationSec, options?.signal);
+}
diff --git a/src/lib/captioning/extractMono16kWebDemuxer.ts b/src/lib/captioning/extractMono16kWebDemuxer.ts
new file mode 100644
index 000000000..4898ca011
--- /dev/null
+++ b/src/lib/captioning/extractMono16kWebDemuxer.ts
@@ -0,0 +1,187 @@
+import { WebDemuxer } from "web-demuxer";
+
+import { MAX_CAPTION_AUDIO_SEC } from "./captionConstants";
+
+const DECODE_QUEUE_BACKPRESSURE = 20; // stop feeding chunks while the decoder queue is larger than this
+const SOURCE_LOAD_TIMEOUT_MS = 60_000; // limit for each of the demuxer load and media-info calls
+const READ_END_PADDING_SEC = 0.5; // added to MAX_CAPTION_AUDIO_SEC to form the demuxer read window
+
+function webDemuxerWasmUrl(): string {
+	return String(new URL("../exporter/wasm/web-demuxer.wasm", window.location.href)); // absolute URL
+}
+
+/**
+ * Downmixes one decoded `AudioData` frame to mono by averaging its channels.
+ * Every channel is read with `format: "f32-planar"`, so `copyTo` converts integer and
+ * interleaved sample formats (s16, s32, u8, ...) into float planes. Copying without an
+ * explicit format writes the frame's native samples into the Float32Array, which only
+ * yields valid audio when the decoder already emits 32-bit float samples.
+ *
+ * @param frame Decoded frame; it is read but not closed here.
+ * @returns One sample per frame position; all zeros when the frame reports no channels.
+ */
+function audioDataFrameToMono(frame: AudioData): Float32Array {
+	const frames = frame.numberOfFrames;
+	const ch = frame.numberOfChannels;
+	const out = new Float32Array(frames);
+	if (ch === 0) return out;
+
+	// Scratch plane reused for each channel to avoid one allocation per channel.
+	const plane = new Float32Array(frames);
+	for (let c = 0; c < ch; c++) {
+		frame.copyTo(plane, { planeIndex: c, format: "f32-planar" });
+		for (let i = 0; i < frames; i++) {
+			out[i] += plane[i];
+		}
+	}
+	if (ch > 1) {
+		for (let i = 0; i < frames; i++) {
+			out[i] /= ch;
+		}
+	}
+	return out;
+}
+
+/** Lays timestamp-ordered frames onto one mono timeline, averaging overlaps; closes each frame. */
+function mergeDecodedAudioToMonoLinear(
+	frames: AudioData[],
+	sampleRate: number,
+	durationSec: number,
+): Float32Array {
+	const timelineLength = Math.max(1, Math.ceil(durationSec * sampleRate));
+	const mixed = new Float32Array(timelineLength);
+	const hits = new Float32Array(timelineLength);
+
+	// Work on a sorted copy so the caller's array order is left untouched.
+	const ordered = frames.slice().sort((lhs, rhs) => lhs.timestamp - rhs.timestamp);
+	ordered.forEach((frame) => {
+		// The timestamp is divided by 1e6 to get seconds, then scaled to a sample index.
+		const firstIndex = Math.round((frame.timestamp / 1e6) * sampleRate);
+		audioDataFrameToMono(frame).forEach((sample, offset) => {
+			const target = firstIndex + offset;
+			if (target < 0 || target >= timelineLength) return;
+			mixed[target] += sample;
+			hits[target] += 1;
+		});
+		frame.close();
+	});
+
+	// Samples covered by more than one frame are averaged; uncovered samples stay at zero.
+	for (let k = 0; k < timelineLength; k++) {
+		if (hits[k] > 0) mixed[k] /= hits[k];
+	}
+	return mixed;
+}
+
+function withTimeout<T>(promise: Promise<T>, ms: number, message: string): Promise<T> {
+	// Settles like `promise`, or rejects with Error(message) if `ms` milliseconds elapse first.
+	return new Promise<T>((resolve, reject) => {
+		const timer = window.setTimeout(() => reject(new Error(message)), ms);
+		const settle = (finish: () => void): void => {
+			// Disarm the timer before settling so it cannot fire afterwards.
+			window.clearTimeout(timer);
+			finish();
+		};
+		promise
+			.then((value) => settle(() => resolve(value)))
+			.catch((err) => settle(() => reject(err instanceof Error ? err : new Error(String(err)))));
+	});
+}
+
+/**
+ * Demux + WebCodecs audio decode (same stack as export). Use when
+ * `decodeAudioData` cannot handle the container (e.g. WebM with video).
+ * @param file Source media handed to the demuxer.
+ * @param signal Optional cancellation; once aborted the call rejects with an `AbortError`.
+ * @returns Mono PCM, its sample rate (48 kHz when the decoder config has none) and duration.
+ * @throws Error on timeout, a missing or unsupported audio track, or when nothing decodes.
+ */
+export async function extractMonoPcmViaWebDemuxer(
+	file: File,
+	signal?: AbortSignal,
+): Promise<{ mono: Float32Array; sampleRate: number; durationSec: number }> {
+	const demuxer = new WebDemuxer({ wasmFilePath: webDemuxerWasmUrl() });
+	await withTimeout(
+		demuxer.load(file),
+		SOURCE_LOAD_TIMEOUT_MS,
+		"Timed out while parsing the source video for captions.",
+	);
+
+	if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+
+	const mediaInfo = await withTimeout(
+		demuxer.getMediaInfo(),
+		SOURCE_LOAD_TIMEOUT_MS,
+		"Timed out while reading media info for captions.",
+	);
+
+	const reportedDurationSec =
+		Number.isFinite(mediaInfo.duration) && mediaInfo.duration > 0 ? mediaInfo.duration : 0;
+
+	let audioConfig: AudioDecoderConfig;
+	try {
+		audioConfig = await demuxer.getDecoderConfig("audio");
+	} catch {
+		throw new Error("No audio track found in this video.");
+	}
+
+	const codecCheck = await AudioDecoder.isConfigSupported(audioConfig);
+	if (!codecCheck.supported) {
+		throw new Error(`Audio codec not supported for captions: ${audioConfig.codec}`);
+	}
+
+	const sampleRate = audioConfig.sampleRate || 48_000;
+
+	// Many WebM/Matroska files report a too-short duration; capping read at reported time stops
+	// demux early and mergeDecodedAudioToMonoLinear clips everything past that. Read up to the
+	// same ceiling as caption decode (demuxer stops when the track ends).
+	const readEndSec = MAX_CAPTION_AUDIO_SEC + READ_END_PADDING_SEC;
+	const decodedFrames: AudioData[] = [];
+
+	const decoder = new AudioDecoder({
+		output: (data: AudioData) => decodedFrames.push(data),
+		error: (e: DOMException) => console.error("[captioning] AudioDecoder error:", e),
+	});
+	try {
+		decoder.configure(audioConfig);
+		const reader = demuxer.read("audio", 0, readEndSec).getReader();
+		try {
+			while (!signal?.aborted) {
+				const { done, value: chunk } = await reader.read();
+				// A decoder error closes the decoder; stop feeding it rather than let decode() throw.
+				if (done || !chunk || decoder.state !== "configured") break;
+				decoder.decode(chunk);
+				while (decoder.decodeQueueSize > DECODE_QUEUE_BACKPRESSURE && !signal?.aborted) {
+					await new Promise((r) => setTimeout(r, 1));
+				}
+			}
+		} finally {
+			try {
+				await reader.cancel();
+			} catch {
+				/* already closed */
+			}
+		}
+		if (decoder.state === "configured" && !signal?.aborted) await decoder.flush();
+		if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+		if (decodedFrames.length === 0) throw new Error("Decoded zero audio frames from this video.");
+		let maxEndUs = 0;
+		for (const f of decodedFrames) {
+			const end = f.timestamp + (f.duration ?? 0);
+			if (end > maxEndUs) maxEndUs = end;
+		}
+		const inferredDurationSec = maxEndUs / 1e6;
+		// Prefer extent implied by decoded frames (fixes bad container duration). If frames lack
+		// duration, fall back to reported metadata.
+		const durationSec = inferredDurationSec > 0.02 ? inferredDurationSec : reportedDurationSec;
+		// The merge step closes every frame it consumes.
+		const mono = mergeDecodedAudioToMonoLinear(decodedFrames, sampleRate, durationSec);
+		return { mono, sampleRate, durationSec };
+	} catch (e) {
+		// Release decoded frames on failure or cancellation (closing a frame twice is harmless).
+		for (const f of decodedFrames) f.close();
+		throw e;
+	} finally {
+		if (decoder.state !== "closed") decoder.close();
+	}
+}
diff --git a/src/lib/captioning/index.ts b/src/lib/captioning/index.ts
new file mode 100644
index 000000000..cc2e2a3a6
--- /dev/null
+++ b/src/lib/captioning/index.ts
@@ -0,0 +1,17 @@
+export type { CaptionSegmentLayoutOptions } from "./annotationsFromCaptions";
+export {
+	captionSegmentsToAnnotationRegions,
+	DEFAULT_AUTO_CAPTION_MIN_GAP_MS,
+	groupTimedCaptionWordsIntoLines,
+	mergeAdjacentCaptionSegments,
+	reconcileAutoCaptionTimelineGaps,
+	splitMergedCaptionsByWordBounds,
+} from "./annotationsFromCaptions";
+export { extractMono16kFromVideoUrl, MAX_CAPTION_AUDIO_SEC } from "./extractMono16k";
+export { shiftTrimRegionsMsForCaptionBuffer, trimLeadingSilenceMono16k } from "./leadingSilence";
+export type {
+	CaptionSegment,
+	CaptionTimestampGranularity,
+	TranscribeMono16kResult,
+} from "./transcribe";
+export { transcribeMono16kToSegments } from "./transcribe";
diff --git a/src/lib/captioning/leadingSilence.ts b/src/lib/captioning/leadingSilence.ts
new file mode 100644
index 000000000..5ad7c50f4
--- /dev/null
+++ b/src/lib/captioning/leadingSilence.ts
@@ -0,0 +1,83 @@
+/** Caption path is always mono 16 kHz after `extractMono16kFromVideoUrl`. */
+import type { TrimRegion } from "@/components/video-editor/types";
+
+const SAMPLE_RATE = 16_000; // Hz; the caption buffer is always mono 16 kHz
+
+/** Window length for peak detection (800 samples = 50 ms at 16 kHz). */
+const WINDOW_SAMPLES = 800;
+
+/** Hop between scanned windows; equal to the window length, so windows do not overlap. */
+const HOP_SAMPLES = 800;
+
+/** A window whose max |sample| is at or below this counts as silence (float PCM ~[-1, 1]). */
+const PEAK_THRESHOLD = 0.012;
+
+/** Keep a little audio before the first peak so word onsets are not clipped. */
+const PRE_ROLL_SEC = 0.12;
+
+/** Do not scan more than this much audio for leading silence (performance + pathological files). */
+const MAX_LEADING_SCAN_SEC = 15 * 60;
+
+/**
+ * Removes a quiet lead-in so Whisper is not handed a long silent prefix (which can skew the
+ * first phrase and wastes work). `trimSec` reports how much was cut from the front; callers
+ * must add it back to every segment time.
+ */
+export function trimLeadingSilenceMono16k(samples: Float32Array): {
+	samples: Float32Array;
+	trimSec: number;
+} {
+	const untouched = { samples, trimSec: 0 };
+	if (samples.length < WINDOW_SAMPLES) return untouched;
+
+	// Last window start that lies fully inside the buffer and within the scan budget.
+	const lastWindowStart = Math.min(
+		samples.length - WINDOW_SAMPLES,
+		Math.floor(MAX_LEADING_SCAN_SEC * SAMPLE_RATE),
+	);
+
+	const windowPeak = (from: number): number =>
+		samples
+			.subarray(from, from + WINDOW_SAMPLES)
+			.reduce((loudest, value) => Math.max(loudest, Math.abs(value)), 0);
+
+	// Walk window by window until one is louder than the silence threshold.
+	let onset = -1;
+	let cursor = 0;
+	while (onset < 0 && cursor <= lastWindowStart) {
+		if (windowPeak(cursor) > PEAK_THRESHOLD) onset = cursor;
+		cursor += HOP_SAMPLES;
+	}
+
+	// Nothing audible found, or the audio is already loud in the very first window.
+	if (onset <= 0) return untouched;
+
+	const keepFrom = Math.max(0, onset - Math.round(PRE_ROLL_SEC * SAMPLE_RATE));
+	return {
+		samples: samples.subarray(keepFrom),
+		trimSec: keepFrom / SAMPLE_RATE,
+	};
+}
+
+/**
+ * Re-bases trim regions onto the front-trimmed caption buffer: Whisper times are relative to
+ * that shortened buffer, so regions move earlier by `trimMs` to keep `segmentOverlapsTrim`
+ * comparing like with like. Regions that collapse to nothing after clamping are dropped.
+ */
+export function shiftTrimRegionsMsForCaptionBuffer(
+	regions: TrimRegion[],
+	trimMs: number,
+): TrimRegion[] {
+	if (trimMs <= 0) return regions;
+
+	const rebased: TrimRegion[] = [];
+	for (const region of regions) {
+		// Shift first, then clamp each edge so nothing starts before the buffer's origin.
+		const startMs = Math.max(0, region.startMs - trimMs);
+		const endMs = Math.max(0, region.endMs - trimMs);
+		if (endMs > startMs) {
+			rebased.push({ ...region, startMs, endMs });
+		}
+	}
+	return rebased;
+}
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
new file mode 100644
index 000000000..96b235ffc
--- /dev/null
+++ b/src/lib/captioning/transcribe.ts
@@ -0,0 +1,286 @@
+import type { TrimRegion } from "@/components/video-editor/types";
+
+export interface CaptionSegment {
+	startSec: number; // segment start, in seconds
+	endSec: number; // segment end, in seconds
+	text: string; // caption text shown for this span
+}
+
+/** How caption layout should interpret `CaptionSegment` times from `transcribeMono16kToSegments`. */
+export type CaptionTimestampGranularity = "word" | "phrase";
+
+export interface TranscribeMono16kResult {
+	segments: CaptionSegment[]; // empty when every transcription pass produced nothing
+	granularity: CaptionTimestampGranularity; // what each segment's time span represents
+}
+
+function segmentOverlapsTrim(startMs: number, endMs: number, trims: TrimRegion[]): boolean {
+	return trims.findIndex((trim) => trim.startMs < endMs && startMs < trim.endMs) !== -1; // strict overlap
+}
+
+/**
+ * ONNX Runtime's wasm bundle treats `process.versions.node` (present in Electron's
+ * renderer) as Node and tries `require("fs")`, which Vite does not support. Mask it
+ * only while Transformers / ORT run. The original property descriptor is restored afterwards,
+ * including when `fn` throws synchronously, and restoring can never reject a successful run.
+ */
+function withoutNodeVersion<T>(fn: () => Promise<T>): Promise<T> {
+	const versions: object | null =
+		typeof process !== "undefined" && process.versions && typeof process.versions === "object"
+			? process.versions
+			: null;
+	const saved = versions ? Object.getOwnPropertyDescriptor(versions, "node") : undefined;
+	// Both Reflect calls report failure by returning false instead of throwing.
+	const masked = versions !== null && saved !== undefined &&
+		(Reflect.deleteProperty(versions, "node") || Reflect.set(versions, "node", undefined));
+	const restore = (): void => {
+		if (masked && versions && saved) Reflect.defineProperty(versions, "node", saved);
+	};
+	try {
+		return fn().finally(restore);
+	} catch (e) {
+		restore();
+		return Promise.reject(e);
+	}
+}
+
+/** Whisper runs with internal 30s chunks; keep each forward pass bounded for WASM memory. */
+const TRANSCRIBE_SLICE_SAMPLES = 12 * 60 * 16_000; // 12 minutes of 16 kHz samples per slice
+
+/** Very short slices are skipped in the multi-slice loop unless padded (see `padTailSliceForTranscribe`). */
+const MIN_TRANSCRIBE_SLICE_SAMPLES = 800; // 50 ms at 16 kHz
+
+/**
+ * Zero-pads a too-short tail slice up to the minimum length so Whisper still runs on it.
+ * `realDurationSec` always describes the unpadded input, so callers can clamp timestamps
+ * and the padding never extends perceived audio on the timeline.
+ */
+function padTailSliceForTranscribe(samples: Float32Array): {
+	slice: Float32Array;
+	realDurationSec: number;
+} {
+	const needsPadding = samples.length < MIN_TRANSCRIBE_SLICE_SAMPLES;
+	// A new Float32Array starts zero-filled, so copying the input in leaves silence after it.
+	const slice = needsPadding ? new Float32Array(MIN_TRANSCRIBE_SLICE_SAMPLES) : samples;
+	if (needsPadding) slice.set(samples);
+
+	return { slice, realDurationSec: samples.length / 16_000 };
+}
+
+function segmentsFromTranscriberChunks( // turns one slice's raw chunks into timeline segments
+	chunks: Array<{ timestamp?: [number | null, number | null]; text?: unknown }>,
+	timeOffsetSec: number, // added to every chunk time to place it on the full timeline
+	trims: TrimRegion[], // segments overlapping any of these millisecond ranges are dropped
+	audioDurationSec: number, // upper bound applied to chunk end times within this slice
+): CaptionSegment[] {
+	const sorted = [...chunks].sort((x, y) => { // by start time; a non-numeric start sorts as -1
+		const ax = x.timestamp?.[0];
+		const ay = y.timestamp?.[0];
+		const na = typeof ax === "number" ? ax : -1;
+		const nb = typeof ay === "number" ? ay : -1;
+		return na - nb;
+	});
+
+	const segments: CaptionSegment[] = [];
+
+	for (let idx = 0; idx < sorted.length; idx++) {
+		const c = sorted[idx]!;
+		const ts = c.timestamp as [number | null, number | null] | undefined;
+		if (!ts) continue; // chunks without a timestamp pair are ignored
+		let a = ts[0];
+		let b = ts[1];
+		if (a == null) a = 0; // missing start: treat as the beginning of the slice
+		a = Math.max(0, a);
+		if (b == null) { // open-ended: end at the next numeric start, else at the slice duration
+			let nextStart: number | null = null;
+			for (let j = idx + 1; j < sorted.length; j++) {
+				const na = sorted[j]?.timestamp?.[0];
+				if (typeof na === "number") {
+					nextStart = na;
+					break;
+				}
+			}
+			b = nextStart ?? audioDurationSec;
+		}
+		if (b <= a) { // empty or inverted span: allow up to 250 ms, capped at the slice duration
+			b = Math.min(a + 0.25, audioDurationSec);
+		}
+		b = Math.min(b, audioDurationSec);
+
+		const text = String(c.text ?? "")
+			.replace(/\s+/g, " ")
+			.trim();
+		if (!text) continue; // whitespace-only chunks produce no caption
+		// NOTE(review): `a` is not capped, so a start past audioDurationSec yields endSec < startSec.
+		const startSec = a + timeOffsetSec;
+		const sliceEnd = timeOffsetSec + audioDurationSec;
+		const endSec = Math.min(Math.max(startSec + 0.08, b + timeOffsetSec), sliceEnd); // >= 80 ms, <= slice end
+		const startMs = Math.round(startSec * 1000);
+		const endMs = Math.round(endSec * 1000);
+		if (segmentOverlapsTrim(startMs, endMs, trims)) continue; // skip captions touching a trim range
+
+		segments.push({ startSec, endSec, text });
+	}
+	// Merge consecutive segments with identical text that start within 0.42 s of the previous end.
+	segments.sort((u, v) => u.startSec - v.startSec || u.endSec - v.endSec);
+	const rawDeduped: CaptionSegment[] = [];
+	const CHUNK_DUP_MAX_GAP_SEC = 0.42;
+	for (const seg of segments) {
+		const prev = rawDeduped[rawDeduped.length - 1];
+		if (prev && prev.text === seg.text && seg.startSec <= prev.endSec + CHUNK_DUP_MAX_GAP_SEC) {
+			prev.endSec = Math.max(prev.endSec, seg.endSec); // widen the kept segment to cover both
+			prev.startSec = Math.min(prev.startSec, seg.startSec);
+			continue;
+		}
+		rawDeduped.push(seg);
+	}
+	return rawDeduped;
+}
+
+async function runTranscriberOnSlice(
+	transcriber: (audio: Float32Array, opts: Record<string, unknown>) => Promise<unknown>,
+	samples: Float32Array,
+	opts: { forceFullSequences: boolean },
+): Promise<unknown> {
+	// Only chunk long clips; short-audio chunking regressed some Whisper.js runs (empty chunks).
+	const needsChunking = samples.length / 16_000 > 30;
+	const request: Record<string, unknown> = {
+		return_timestamps: true,
+		force_full_sequences: opts.forceFullSequences,
+	};
+	if (needsChunking) Object.assign(request, { chunk_length_s: 30, stride_length_s: 5 });
+	return transcriber(samples, request);
+}
+
+/** Collects the `chunks` arrays from a transcriber result (single object or array of results). */
+function getChunksFromTranscriberResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	type RawChunk = { timestamp?: [number | null, number | null]; text?: unknown };
+	if (result == null) return [];
+
+	const chunksOf = (entry: unknown): RawChunk[] => {
+		const candidate = (entry as { chunks?: unknown })?.chunks;
+		return Array.isArray(candidate) ? candidate : [];
+	};
+
+	// An array of results contributes each entry's chunks, concatenated in order.
+	return Array.isArray(result) ? result.flatMap(chunksOf) : chunksOf(result);
+}
+
+/** Prefer `chunks`; otherwise wrap a non-empty top-level `text` in one open-ended span. */
+function extractChunksFromAsrResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	const timed = getChunksFromTranscriberResult(result);
+	if (timed.length > 0) return timed;
+
+	// No chunk list: look at the first (or only) result object for a plain transcript.
+	const first = Array.isArray(result) ? result[0] : result;
+	const rawText = (first as { text?: unknown })?.text;
+	const transcript = typeof rawText === "string" ? rawText.trim() : "";
+	if (transcript === "") return [];
+
+	// The end time is unknown here, so it is left null for the caller to resolve.
+	return [{ timestamp: [0, null], text: transcript }];
+}
+
+/**
+ * Runs Whisper in-browser via Transformers.js. First run downloads model weights.
+ * Long audio is split into slices so one forward pass does not exhaust WASM memory;
+ * timestamps are shifted to the full timeline. A failed pass is logged and counts as empty.
+ */
+export async function transcribeMono16kToSegments(
+	samples: Float32Array,
+	options?: {
+		trimRegions?: TrimRegion[];
+		onStatus?: (phase: "model" | "transcribe") => void;
+		signal?: AbortSignal;
+	},
+): Promise<TranscribeMono16kResult> {
+	return withoutNodeVersion(async () => {
+		const { pipeline, env } = await import("@xenova/transformers"); // loaded lazily on first use
+		env.allowLocalModels = false;
+
+		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+
+		options?.onStatus?.("model");
+		// Default tiny weights only: the `output_attentions` revision has regressed inference for
+		// some environments (empty chunks / thrown errors) while phrase mode works on this model.
+		const transcriber = await pipeline("automatic-speech-recognition", "Xenova/whisper-tiny");
+
+		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+
+		const trims = options?.trimRegions ?? [];
+		options?.onStatus?.("transcribe");
+		// One full pass over `samples`; resolves to [] when the pass throws anything but AbortError.
+		const transcribeOne = async (
+			ignoreTrims: boolean,
+			forceFullSequences: boolean,
+		): Promise<CaptionSegment[]> => {
+			try {
+				const activeTrims = ignoreTrims ? [] : trims;
+				if (samples.length <= TRANSCRIBE_SLICE_SAMPLES) { // fits in one slice: no time offset
+					const { slice, realDurationSec } = padTailSliceForTranscribe(samples);
+					const result = await runTranscriberOnSlice(transcriber, slice, {
+						forceFullSequences,
+					});
+					return segmentsFromTranscriberChunks(
+						extractChunksFromAsrResult(result),
+						0,
+						activeTrims,
+						realDurationSec,
+					);
+				}
+				// Longer audio: transcribe consecutive slices and offset each slice's timestamps.
+				const all: CaptionSegment[] = [];
+				for (let offset = 0; offset < samples.length; offset += TRANSCRIBE_SLICE_SAMPLES) {
+					if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+					const end = Math.min(offset + TRANSCRIBE_SLICE_SAMPLES, samples.length);
+					const sliceRaw = samples.subarray(offset, end);
+					const isFinalSlice = end >= samples.length;
+					if (sliceRaw.length === 0) continue;
+					if (sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && !isFinalSlice) continue; // defensive: non-final slices are full length
+
+					const { slice, realDurationSec } =
+						sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && isFinalSlice
+							? padTailSliceForTranscribe(sliceRaw)
+							: { slice: sliceRaw, realDurationSec: sliceRaw.length / 16_000 };
+
+					const result = await runTranscriberOnSlice(transcriber, slice, {
+						forceFullSequences,
+					});
+					const tOff = offset / 16_000; // slice start in seconds on the full timeline
+					all.push(
+						...segmentsFromTranscriberChunks(
+							extractChunksFromAsrResult(result),
+							tOff,
+							activeTrims,
+							realDurationSec,
+						),
+					);
+				}
+				return all;
+			} catch (e) {
+				if (e instanceof DOMException && e.name === "AbortError") throw e; // cancellation propagates
+				console.warn("[captioning] Whisper pass failed:", e);
+				return [];
+			}
+		};
+		// Retry ladder: each later pass runs only while the result is still empty.
+		let segments = await transcribeOne(false, true); // trims applied, force_full_sequences on
+		if (segments.length === 0) {
+			segments = await transcribeOne(false, false); // trims applied, force_full_sequences off
+		}
+		if (segments.length === 0 && trims.length > 0) { // last resort: ignore the trim regions
+			segments = await transcribeOne(true, true);
+			if (segments.length === 0) {
+				segments = await transcribeOne(true, false);
+			}
+		}
+
+		return { segments, granularity: "phrase" }; // this function always reports phrase granularity
+	});
+}
diff --git a/src/lib/vite-stubs/empty-node-module.ts b/src/lib/vite-stubs/empty-node-module.ts
new file mode 100644
index 000000000..16ee52688
--- /dev/null
+++ b/src/lib/vite-stubs/empty-node-module.ts
@@ -0,0 +1,7 @@
+/**
+ * Default export is a prototype-less object with no keys. Used as a Vite alias target for
+ * Node builtins that `@xenova/transformers` imports; `env.js` treats an empty object
+ * as “no filesystem” so it stays on browser / remote paths.
+ */
+const empty = Object.create(null) as Record<string, never>;
+export default empty;
diff --git a/src/lib/vite-stubs/onnxruntime-node-stub.ts b/src/lib/vite-stubs/onnxruntime-node-stub.ts
new file mode 100644
index 000000000..a70b3dd60
--- /dev/null
+++ b/src/lib/vite-stubs/onnxruntime-node-stub.ts
@@ -0,0 +1,10 @@
+/**
+ * Transformers always imports `onnxruntime-node`, then picks web vs node from `process.release.name`.
+ * In Electron's renderer that name is often `"node"` while we still must use the WASM build — the real
+ * `onnxruntime-node` package is aliased away (it pulls `fs`). Re-export `onnxruntime-web` here so the
+ * "node" branch still receives a working ORT with `registerBackend` etc.
+ */
+import * as ortWeb from "onnxruntime-web";
+// Use the module's `default` export when it has one; otherwise export the namespace object itself.
+const ort = (ortWeb as { default?: typeof ortWeb }).default ?? ortWeb;
+export default ort;
diff --git a/vite.config.ts b/vite.config.ts
index 55b55966b..86dd8e438 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -23,8 +23,17 @@ export default defineConfig({
 	resolve: {
 		alias: {
 			"@": path.resolve(__dirname, "src"),
+			// @xenova/transformers: env.js statically imports fs/path/url; onnx.js imports
+			// onnxruntime-node (must not be bundled in the renderer — it requires fs).
+			fs: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+			path: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+			url: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+			"onnxruntime-node": path.resolve(__dirname, "src/lib/vite-stubs/onnxruntime-node-stub.ts"), // re-exports web ORT
 		},
 	},
+	optimizeDeps: {
+		exclude: ["@xenova/transformers"],
+	},
 	build: {
 		target: "esnext",
 		minify: "terser",

From 1c431fe47355139dc0a2f872b474ff502b6f0fa8 Mon Sep 17 00:00:00 2001
From: parse-nip <152457438+parse-nip@users.noreply.github.com>
Date: Thu, 14 May 2026 09:41:39 +0800
Subject: [PATCH 2/7] fix: improve auto-caption timing and pause handling

---
 src/components/video-editor/VideoEditor.tsx   |  41 +++-
 .../annotationsFromCaptions.test.ts           | 158 ++++++++++++++
 src/lib/captioning/annotationsFromCaptions.ts | 203 +++++++++++++-----
 src/lib/captioning/transcribe.ts              |  29 ++-
 4 files changed, 357 insertions(+), 74 deletions(-)
 create mode 100644 src/lib/captioning/annotationsFromCaptions.test.ts

diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx
index 86ee26ba0..e96202ac9 100644
--- a/src/components/video-editor/VideoEditor.tsx
+++ b/src/components/video-editor/VideoEditor.tsx
@@ -1816,18 +1816,32 @@ export default function VideoEditor() {
 				const trimMs = Math.round(trimSec * 1000);
 				const trimRegionsForTranscribe = shiftTrimRegionsMsForCaptionBuffer(trimRegions, trimMs);
 
-				const { segments: segmentsRaw, granularity } = await transcribeMono16kToSegments(
+				const transcribeOptions = {
+					onStatus: (phase: "model" | "transcribe") => {
+						if (phase === "model") {
+							toast.loading(t("autoCaptions.loadingModel"), { id: toastId });
+						}
+					},
+				};
+
+				let { segments: segmentsRaw, granularity } = await transcribeMono16kToSegments(
 					speechSamples,
 					{
 						trimRegions: trimRegionsForTranscribe,
-						onStatus: (phase) => {
-							if (phase === "model") {
-								toast.loading(t("autoCaptions.loadingModel"), { id: toastId });
-							}
-						},
+						...transcribeOptions,
 					},
 				);
 
+				// Some recordings come back empty after leading-silence trimming even though the full
+				// source has recognizable speech. Retry once against the untouched audio buffer before
+				// giving up so we do not show "no speech detected" for a spoken clip.
+				if (segmentsRaw.length === 0 && trimSec > 0) {
+					({ segments: segmentsRaw, granularity } = await transcribeMono16kToSegments(samples, {
+						trimRegions,
+						...transcribeOptions,
+					}));
+				}
+
 				const segments =
 					trimSec > 0
 						? segmentsRaw.map((s) => ({
@@ -1837,7 +1851,7 @@ export default function VideoEditor() {
 							}))
 						: segmentsRaw;
 
-				const { regions, nextNumericId, nextZIndex } = captionSegmentsToAnnotationRegions(
+				let { regions, nextNumericId, nextZIndex } = captionSegmentsToAnnotationRegions(
 					segments,
 					nextAnnotationIdRef.current,
 					nextAnnotationZIndexRef.current,
@@ -1848,6 +1862,19 @@ export default function VideoEditor() {
 					},
 				);
 
+				if (regions.length === 0 && segments.length > 0) {
+					({ regions, nextNumericId, nextZIndex } = captionSegmentsToAnnotationRegions(
+						segments,
+						nextAnnotationIdRef.current,
+						nextAnnotationZIndexRef.current,
+						{
+							minWordsPerCaption: 1,
+							maxWordsPerCaption: Number.MAX_SAFE_INTEGER,
+							timestampGranularity: granularity,
+						},
+					));
+				}
+
 				if (regions.length === 0) {
 					toast.dismiss(toastId);
 					toast.info(t("autoCaptions.noneHeard"));
diff --git a/src/lib/captioning/annotationsFromCaptions.test.ts b/src/lib/captioning/annotationsFromCaptions.test.ts
new file mode 100644
index 000000000..ea2b3a7be
--- /dev/null
+++ b/src/lib/captioning/annotationsFromCaptions.test.ts
@@ -0,0 +1,158 @@
+import { describe, expect, it } from "vitest";
+
+import {
+	captionSegmentsToAnnotationRegions,
+	groupPhraseCaptionSegmentsIntoLines,
+	groupTimedCaptionWordsIntoLines,
+	reconcileAutoCaptionTimelineGaps,
+} from "./annotationsFromCaptions";
+
+describe("groupPhraseCaptionSegmentsIntoLines", () => {
+	it("preserves phrase boundaries when formatting phrase-timestamp captions", () => {
+		const lines = groupPhraseCaptionSegmentsIntoLines(
+			[
+				{ startSec: 0, endSec: 0.5, text: "alpha beta" },
+				{ startSec: 0.62, endSec: 1.6, text: "gamma delta" }, // starts 0.12 s after the first phrase ends
+			],
+			2,
+			2,
+		);
+
+		expect(lines).toHaveLength(2);
+		expect(lines[0]).toMatchObject({ text: "alpha beta", startSec: 0 });
+		expect(lines[1]).toMatchObject({ text: "gamma delta", startSec: 0.62 });
+		expect(lines[0]!.endSec).toBeLessThanOrEqual(0.62); // first line must end by the second phrase's start
+	});
+
+	it("wraps a single phrase into multiple visual lines without inventing extra timing regions", () => {
+		const lines = groupPhraseCaptionSegmentsIntoLines(
+			[{ startSec: 0, endSec: 1, text: "alpha beta gamma delta" }],
+			2,
+			2,
+		);
+
+		expect(lines).toHaveLength(1); // still one timed entry
+		expect(lines[0]).toMatchObject({
+			startSec: 0,
+			endSec: 1,
+			text: "alpha beta\ngamma delta", // wrapped with a newline after the second word
+		});
+	});
+});
+
+describe("captionSegmentsToAnnotationRegions", () => {
+	it("uses raw phrase timing instead of shifting caption boundaries", () => {
+		const { regions } = captionSegmentsToAnnotationRegions(
+			[
+				{ startSec: 0, endSec: 0.5, text: "first second" },
+				{ startSec: 0.62, endSec: 1.2, text: "third fourth" },
+			],
+			1,
+			1,
+			{ minWordsPerCaption: 2, maxWordsPerCaption: 2, timestampGranularity: "phrase" },
+		);
+
+		expect(regions).toHaveLength(2);
+		expect(regions[0]).toMatchObject({ startMs: 0, endMs: 500 }); // seconds map 1:1 to milliseconds
+		expect(regions[1]).toMatchObject({ startMs: 620, endMs: 1200 });
+	});
+
+	it("preserves empty timeline space when word timestamps contain a real pause", () => {
+		const lines = groupTimedCaptionWordsIntoLines( // this case calls the word-grouping helper directly
+			[
+				{ startSec: 0, endSec: 0.12, text: "first" },
+				{ startSec: 0.13, endSec: 0.28, text: "caption" },
+				{ startSec: 0.7, endSec: 0.83, text: "second" }, // 0.42 s after the previous word ends
+				{ startSec: 0.84, endSec: 0.98, text: "caption" },
+			],
+			2,
+			2,
+		);
+
+		expect(lines).toHaveLength(2);
+		expect(lines[0]).toMatchObject({ startSec: 0, endSec: 0.28, text: "first caption" });
+		expect(lines[1]).toMatchObject({ startSec: 0.7, endSec: 0.98, text: "second caption" });
+	});
+});
+
+describe("reconcileAutoCaptionTimelineGaps", () => {
+	it("does not change regions when the minimum enforced gap is zero", () => {
+		const regions = reconcileAutoCaptionTimelineGaps([ // no gap argument is passed, so the default applies
+			{
+				id: "annotation-1",
+				startMs: 0,
+				endMs: 120,
+				type: "text",
+				content: "one",
+				annotationSource: "auto-caption",
+				position: { x: 0, y: 0 },
+				size: { width: 10, height: 10 },
+				style: {
+					color: "#fff",
+					backgroundColor: "transparent",
+					fontSize: 24,
+					fontFamily: "Inter",
+					fontWeight: "normal",
+					fontStyle: "normal",
+					textDecoration: "none",
+					textAlign: "center",
+				},
+				zIndex: 1,
+			},
+			{
+				id: "manual-1", // has no annotationSource, unlike the two auto-caption regions
+				startMs: 50,
+				endMs: 1000,
+				type: "text",
+				content: "manual",
+				position: { x: 10, y: 10 },
+				size: { width: 10, height: 10 },
+				style: {
+					color: "#fff",
+					backgroundColor: "transparent",
+					fontSize: 24,
+					fontFamily: "Inter",
+					fontWeight: "normal",
+					fontStyle: "normal",
+					textDecoration: "none",
+					textAlign: "center",
+				},
+				zIndex: 2,
+			},
+			{
+				id: "annotation-2",
+				startMs: 130, // 10 ms after annotation-1 ends
+				endMs: 300,
+				type: "text",
+				content: "two",
+				annotationSource: "auto-caption",
+				position: { x: 0, y: 0 },
+				size: { width: 10, height: 10 },
+				style: {
+					color: "#fff",
+					backgroundColor: "transparent",
+					fontSize: 24,
+					fontFamily: "Inter",
+					fontWeight: "normal",
+					fontStyle: "normal",
+					textDecoration: "none",
+					textAlign: "center",
+				},
+				zIndex: 3,
+			},
+		]);
+		// Every region, including the overlapping manual one, must keep its original times.
+		expect(regions.find((r) => r.id === "manual-1")).toMatchObject({
+			startMs: 50,
+			endMs: 1000,
+		});
+		expect(regions.find((r) => r.id === "annotation-1")).toMatchObject({
+			startMs: 0,
+			endMs: 120,
+		});
+		expect(regions.find((r) => r.id === "annotation-2")).toMatchObject({
+			startMs: 130,
+			endMs: 300,
+		});
+	});
+});
diff --git a/src/lib/captioning/annotationsFromCaptions.ts b/src/lib/captioning/annotationsFromCaptions.ts
index db26669a3..cfae163ee 100644
--- a/src/lib/captioning/annotationsFromCaptions.ts
+++ b/src/lib/captioning/annotationsFromCaptions.ts
@@ -5,7 +5,7 @@ import type { CaptionSegment } from "./transcribe";
 /** Wide lower-third bar; `position.x` is top-left as % of container, so center with (100 − width) / 2. */
 const CAPTION_WIDTH = 92;
 const CAPTION_HEIGHT = 12;
-const CAPTION_BOTTOM_MARGIN = 4;
+const CAPTION_BOTTOM_MARGIN = 2;
 
 const CAPTION_POSITION = {
 	x: (100 - CAPTION_WIDTH) / 2,
@@ -29,38 +29,33 @@ const CAPTION_STYLE: AnnotationTextStyle = {
  * Nudge caption **starts** earlier (seconds). Whisper onsets are often slightly late vs. what you
  * hear; do **not** apply the same offset to ends — that pulls lines off-screen too early.
  */
-const AUTO_CAPTION_START_BIAS_SEC = -0.2;
+const AUTO_CAPTION_START_BIAS_SEC = 0;
 
 /**
  * Extra time held after Whisper’s segment **end** (seconds). Model end times are often early vs.
  * trailing vowels / room tone; this is separate from `AUTO_CAPTION_START_BIAS_SEC`.
  */
-const AUTO_CAPTION_END_HOLD_SEC = 0.12;
-
-/** First phrases often sit a bit early in the model; delay only the first two timeline lines (seconds). */
-const FIRST_TWO_CAPTION_DELAY_SEC = 0.32;
+const AUTO_CAPTION_END_HOLD_SEC = 0;
 
 /** Inside one Whisper phrase, sub-lines can be shorter (do not steal time from neighbors). */
 const WORD_SPLIT_MIN_SPAN_SEC = 0.02;
 
 /** Brief linger after the last word in a line (seconds); trimmed if it would overlap the next line. */
-const CAPTION_LINE_END_TAIL_SEC = 0.12;
+const CAPTION_LINE_END_TAIL_SEC = 0;
+
+/** A real silence between word-level timestamps should start a new caption run. */
+const WORD_RUN_BREAK_GAP_SEC = 0.24;
 
 /**
  * Minimum time between consecutive caption regions on the timeline (seconds). Keeps a visible gap
  * so blocks do not read as one clip; kept small so we do not erase natural short pauses between phrases.
  */
-const MIN_CAPTION_TIMELINE_GAP_SEC = 0.024;
+const MIN_CAPTION_TIMELINE_GAP_SEC = 0;
 
 /** Same text again with almost no gap or overlap — common Whisper / chunk artifact. */
 const DEDUPE_SAME_TEXT_MAX_GAP_SEC = 0.55;
 
-/**
- * Same caption content re-emerging shortly after the last time that text appeared (stride /
- * decoding loops). Wider than `DEDUPE_SAME_TEXT_MAX_GAP_SEC` so non-adjacent duplicates still
- * collapse after grouping.
- */
-const SAME_CONTENT_ECHO_MAX_GAP_SEC = 1.15;
+export const SAME_CONTENT_ECHO_MAX_GAP_SEC = 1.15;
 
 function normalizeCaptionKey(text: string): string {
 	return text
@@ -72,8 +67,8 @@ function normalizeCaptionKey(text: string): string {
 		.replace(/[.!?,;:]+$/g, "");
 }
 
-/** Merges duplicate lines when the same wording appears again within `SAME_CONTENT_ECHO_MAX_GAP_SEC`. */
-function collapseSameContentEchoes(segments: CaptionSegment[]): CaptionSegment[] {
+/** Legacy echo-collapse helper kept for reference while phrase timing uses raw model spans. */
+export function collapseSameContentEchoes(segments: CaptionSegment[]): CaptionSegment[] {
 	const sorted = [...segments]
 		.filter((s) => s.text.trim())
 		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
@@ -105,12 +100,2 @@ function collapseSameContentEchoes(segments: CaptionSegment[]): CaptionSegment[]
- * Only merge segments that are almost back-to-back (Whisper often splits mid-phrase with a tiny gap).
- * Wider gaps are usually silence or missed audio — merging those stretches word timing across dead air.
- */
-const AUTO_CAPTION_MERGE_OPTIONS = {
-	maxGapSec: 0.16,
-	maxChars: 500,
-	maxBlockDurationSec: 5,
-} as const;
-
-/**
  * Collapse adjacent duplicate lines (overlapping or tiny gap). Does not merge the same phrase
  * repeated later in the video when separated by real silence.
@@ -137,11 +126,7 @@ function dedupeAdjacentCaptionRepeats(segments: CaptionSegment[]): CaptionSegmen
 	return out;
 }
 
-/**
- * Apply start bias + end hold, then trim only *real* overlaps (previous end into next start). No
- * minimum-duration stretching and no snapping starts — that was collapsing gaps and stacking lines
- * on the timeline.
- */
+/** Trim only real overlaps. Avoid synthetic lead/lag so caption timing matches model output. */
 function finalizeCaptionSegmentsForPlayback(segments: CaptionSegment[]): CaptionSegment[] {
 	const OVERLAP_TRIM_SEC = 0.002;
 
@@ -149,10 +134,9 @@ function finalizeCaptionSegmentsForPlayback(segments: CaptionSegment[]): Caption
 		.filter((s) => s.text.trim())
 		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
 
-	const a = sortedRaw.map((seg, i) => {
-		const earlyHold = i < 2 ? FIRST_TWO_CAPTION_DELAY_SEC : 0;
-		let s = seg.startSec + AUTO_CAPTION_START_BIAS_SEC + earlyHold;
-		let e = seg.endSec + AUTO_CAPTION_END_HOLD_SEC + earlyHold;
+	const a = sortedRaw.map((seg) => {
+		let s = seg.startSec + AUTO_CAPTION_START_BIAS_SEC;
+		let e = seg.endSec + AUTO_CAPTION_END_HOLD_SEC;
 		s = Math.max(0, s);
 		if (e <= s) e = s + 0.02;
 		return { startSec: s, endSec: e, text: seg.text.trim() };
@@ -164,14 +148,6 @@ function finalizeCaptionSegmentsForPlayback(segments: CaptionSegment[]): Caption
 		}
 	}
 
-	// Leave at least MIN_CAPTION_TIMELINE_GAP_SEC between lines (shorten previous end only).
-	for (let i = 1; i < a.length; i++) {
-		const needPrevEnd = a[i]!.startSec - MIN_CAPTION_TIMELINE_GAP_SEC;
-		if (a[i - 1]!.endSec > needPrevEnd) {
-			a[i - 1]!.endSec = Math.max(a[i - 1]!.startSec + WORD_SPLIT_MIN_SPAN_SEC, needPrevEnd);
-		}
-	}
-
 	return a;
 }
 
@@ -258,6 +234,69 @@ export function mergeAdjacentCaptionSegments(
 	return out;
 }
 
+/**
+ * Partitions phrase-level segments into runs of neighbouring segments that should be laid out
+ * together.
+ *
+ * Segments with blank text are dropped and the rest are sorted by start, then end, time. A
+ * segment joins the current run only while all three limits hold: the gap from the previous
+ * segment's end is at most `maxGapSec` (default 0, so only touching or overlapping segments
+ * group), the run's text joined with single spaces stays within `maxChars`, and the span from
+ * the run's start stays within `maxBlockDurationSec` (both default to no limit).
+ *
+ * @returns Runs in timeline order with each segment's text trimmed; empty when no segment has
+ * text.
+ */
+function partitionPhraseCaptionSegments(
+	segments: CaptionSegment[],
+	options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+): CaptionSegment[][] {
+	const maxGapSec = options?.maxGapSec ?? 0;
+	const maxChars = options?.maxChars ?? Number.POSITIVE_INFINITY;
+	const maxBlockDurationSec = options?.maxBlockDurationSec ?? Number.POSITIVE_INFINITY;
+
+	const sorted = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	if (sorted.length === 0) return [];
+
+	const groups: CaptionSegment[][] = [];
+	let current: CaptionSegment[] = [];
+	// Length of the current run's text when joined with single spaces. A running total counts
+	// every separator and avoids re-summing the whole run for each incoming segment.
+	let currentChars = 0;
+
+	for (const seg of sorted) {
+		const text = seg.text.trim();
+
+		if (current.length === 0) {
+			current.push({ ...seg, text });
+			currentChars = text.length;
+			continue;
+		}
+
+		const prev = current[current.length - 1]!;
+		const groupStart = current[0]!.startSec;
+		const gap = seg.startSec - prev.endSec;
+		const wouldChars = currentChars + 1 + text.length;
+		const wouldSpan = Math.max(prev.endSec, seg.endSec) - groupStart;
+
+		if (gap <= maxGapSec && wouldChars <= maxChars && wouldSpan <= maxBlockDurationSec) {
+			current.push({ ...seg, text });
+			currentChars = wouldChars;
+			continue;
+		}
+
+		groups.push(current);
+		current = [{ ...seg, text }];
+		currentChars = text.length;
+	}
+
+	// `sorted` is non-empty here, so the loop always leaves one open run to flush.
+	groups.push(current);
+	return groups;
+}
+
 export interface CaptionSegmentLayoutOptions {
 	/** Lower bound on words per on-screen caption (default 2). */
 	minWordsPerCaption?: number;
@@ -324,19 +348,37 @@ export function groupTimedCaptionWordsIntoLines(
 
 	const minW = Math.max(1, Math.min(Math.floor(minWords), Math.floor(maxWords)));
 	const maxW = Math.max(minW, Math.floor(maxWords));
-	const ranges = computeCaptionLineIndexRanges(words.length, minW, maxW);
 	const out: CaptionSegment[] = [];
-	for (const { from, to } of ranges) {
-		const slice = words.slice(from, to);
-		const s = slice[0]!.startSec;
-		const rawEnd = slice[slice.length - 1]!.endSec;
-		const e = Math.max(s + WORD_SPLIT_MIN_SPAN_SEC, rawEnd + CAPTION_LINE_END_TAIL_SEC);
-		out.push({
-			startSec: s,
-			endSec: e,
-			text: slice.map((w) => w.text.trim()).join(" "),
-		});
+
+	let runStart = 0;
+	const flushRun = (runEndExclusive: number) => {
+		const run = words.slice(runStart, runEndExclusive);
+		if (run.length === 0) return;
+		const ranges = computeCaptionLineIndexRanges(run.length, minW, maxW);
+		for (const { from, to } of ranges) {
+			const slice = run.slice(from, to);
+			const s = slice[0]!.startSec;
+			const rawEnd = slice[slice.length - 1]!.endSec;
+			const e = Math.max(s + WORD_SPLIT_MIN_SPAN_SEC, rawEnd + CAPTION_LINE_END_TAIL_SEC);
+			out.push({
+				startSec: s,
+				endSec: e,
+				text: slice.map((w) => w.text.trim()).join(" "),
+			});
+		}
+	};
+
+	for (let i = 1; i < words.length; i++) {
+		const prev = words[i - 1]!;
+		const cur = words[i]!;
+		const gap = cur.startSec - prev.endSec;
+		if (gap >= WORD_RUN_BREAK_GAP_SEC) {
+			flushRun(i);
+			runStart = i;
+		}
 	}
+	flushRun(words.length);
+
 	for (let i = 0; i < out.length - 1; i++) {
 		if (out[i]!.endSec > out[i + 1]!.startSec + 1e-3) {
 			out[i]!.endSec = Math.max(
@@ -380,6 +422,72 @@ export function splitMergedCaptionsByWordBounds(
 	return out;
 }
 
+/**
+ * Re-flows caption text into newline-separated lines whose word counts are chosen by
+ * `computeCaptionLineIndexRanges` from the clamped `minWords` / `maxWords` bounds. Returns ""
+ * for blank text.
+ */
+function wrapCaptionTextByWordBounds(text: string, minWords: number, maxWords: number): string {
+	const tokens = text.trim().split(/\s+/).filter(Boolean);
+	if (tokens.length === 0) return "";
+
+	// Clamp to 1 <= lower <= upper, flooring fractional bounds.
+	const maxFloor = Math.floor(maxWords);
+	const lower = Math.max(1, Math.min(Math.floor(minWords), maxFloor));
+	const upper = Math.max(lower, maxFloor);
+
+	const lines: string[] = [];
+	for (const { from, to } of computeCaptionLineIndexRanges(tokens.length, lower, upper)) {
+		lines.push(tokens.slice(from, to).join(" "));
+	}
+	return lines.join("\n");
+}
+
+/**
+ * Turns one phrase segment into per-word segments so it can be fed to the word-level grouper.
+ * A single-word phrase keeps the phrase's own span; longer phrases are handed to
+ * `splitOneSegmentByWordBounds` with both word bounds set to 1.
+ */
+function expandPhraseSegmentToPseudoWords(segment: CaptionSegment): CaptionSegment[] {
+	const { startSec, endSec } = segment;
+	const tokens = segment.text.trim().split(/\s+/).filter(Boolean);
+
+	switch (tokens.length) {
+		case 0:
+			return [];
+		case 1:
+			return [{ startSec, endSec, text: tokens[0]! }];
+		default:
+			return splitOneSegmentByWordBounds(startSec, endSec, tokens, 1, 1);
+	}
+}
+
+/**
+ * Lays out phrase-level segments as caption lines.
+ *
+ * Segments are first partitioned by `partitionPhraseCaptionSegments` using `options`. A run
+ * holding a single segment keeps that segment's span and has its text wrapped with "\n"; any
+ * other run is expanded to pseudo-words and grouped by `groupTimedCaptionWordsIntoLines`.
+ */
+export function groupPhraseCaptionSegmentsIntoLines(
+	segments: CaptionSegment[],
+	minWords: number,
+	maxWords: number,
+	options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+): CaptionSegment[] {
+	return partitionPhraseCaptionSegments(segments, options).flatMap((group) => {
+		if (group.length !== 1) {
+			const pseudoWords = group.flatMap(expandPhraseSegmentToPseudoWords);
+			return groupTimedCaptionWordsIntoLines(pseudoWords, minWords, maxWords);
+		}
+
+		const { startSec, endSec, text } = group[0]!;
+		const wrapped = wrapCaptionTextByWordBounds(text, minWords, maxWords);
+		return [{ startSec, endSec, text: wrapped }];
+	});
+}
+
 function splitOneSegmentByWordBounds(
 	startSec: number,
 	endSec: number,
@@ -445,16 +539,11 @@ export function captionSegmentsToAnnotationRegions(
 
 	const grouped =
 		granularity === "phrase"
-			? splitMergedCaptionsByWordBounds(
-					mergeAdjacentCaptionSegments(dedupedIn, AUTO_CAPTION_MERGE_OPTIONS),
-					minW,
-					maxW,
-				)
+			? groupPhraseCaptionSegmentsIntoLines(dedupedIn, minW, maxW)
 			: groupTimedCaptionWordsIntoLines(dedupedIn, minW, maxW);
 
 	const dedupedOut = dedupeAdjacentCaptionRepeats(grouped);
-	const rinsedOut = collapseSameContentEchoes(dedupedOut);
-	const finalized = finalizeCaptionSegmentsForPlayback(rinsedOut);
+	const finalized = finalizeCaptionSegmentsForPlayback(dedupedOut);
 
 	let nid = startNumericId;
 	let z = startZIndex;
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
index 96b235ffc..e3c854cff 100644
--- a/src/lib/captioning/transcribe.ts
+++ b/src/lib/captioning/transcribe.ts
@@ -140,13 +140,13 @@ function segmentsFromTranscriberChunks(
 async function runTranscriberOnSlice(
 	transcriber: (audio: Float32Array, opts: Record<string, unknown>) => Promise<unknown>,
 	samples: Float32Array,
-	opts: { forceFullSequences: boolean },
+	opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase" },
 ): Promise<unknown> {
 	const durationSec = samples.length / 16_000;
 	// Only chunk long clips; short-audio chunking regressed some Whisper.js runs (empty chunks).
 	const chunking = durationSec > 30 ? { chunk_length_s: 30, stride_length_s: 5 } : {};
 	return transcriber(samples, {
-		return_timestamps: true,
+		return_timestamps: opts.timestampMode === "word" ? "word" : true,
 		force_full_sequences: opts.forceFullSequences,
 		...chunking,
 	});
@@ -219,6 +219,7 @@ export async function transcribeMono16kToSegments(
 		const transcribeOne = async (
 			ignoreTrims: boolean,
 			forceFullSequences: boolean,
+			timestampMode: "word" | "phrase",
 		): Promise<CaptionSegment[]> => {
 			try {
 				const activeTrims = ignoreTrims ? [] : trims;
@@ -226,6 +227,7 @@ export async function transcribeMono16kToSegments(
 					const { slice, realDurationSec } = padTailSliceForTranscribe(samples);
 					const result = await runTranscriberOnSlice(transcriber, slice, {
 						forceFullSequences,
+						timestampMode,
 					});
 					return segmentsFromTranscriberChunks(
 						extractChunksFromAsrResult(result),
@@ -251,6 +253,7 @@ export async function transcribeMono16kToSegments(
 
 					const result = await runTranscriberOnSlice(transcriber, slice, {
 						forceFullSequences,
+						timestampMode,
 					});
 					const tOff = offset / 16_000;
 					all.push(
@@ -270,17 +273,23 @@ export async function transcribeMono16kToSegments(
 			}
 		};
 
-		let segments = await transcribeOne(false, true);
-		if (segments.length === 0) {
-			segments = await transcribeOne(false, false);
-		}
-		if (segments.length === 0 && trims.length > 0) {
-			segments = await transcribeOne(true, true);
+		const attemptModes: Array<"word" | "phrase"> = ["word", "phrase"];
+		for (const timestampMode of attemptModes) {
+			let segments = await transcribeOne(false, true, timestampMode);
 			if (segments.length === 0) {
-				segments = await transcribeOne(true, false);
+				segments = await transcribeOne(false, false, timestampMode);
+			}
+			if (segments.length === 0 && trims.length > 0) {
+				segments = await transcribeOne(true, true, timestampMode);
+				if (segments.length === 0) {
+					segments = await transcribeOne(true, false, timestampMode);
+				}
+			}
+			if (segments.length > 0) {
+				return { segments, granularity: timestampMode };
 			}
 		}
 
-		return { segments, granularity: "phrase" };
+		return { segments: [], granularity: "phrase" };
 	});
 }

From 2b7e6f1677dbae7a2635024a1273765b95107e0f Mon Sep 17 00:00:00 2001
From: parse-nip <152457438+parse-nip@users.noreply.github.com>
Date: Thu, 14 May 2026 10:39:08 +0800
Subject: [PATCH 3/7] fix: address PR review for auto captions and locales

- Gate trimSec segment shift when transcription used full buffer retry
- Restore UTF-8 autoCaptions strings (ar, es, fr, ja-JP, ko-KR, tr, zh-CN, zh-TW)
- Dedupe caption segments after grouping only; stricter chunk dedupe in transcribe
- Post-truncate duration, explicit consume merge for web demuxer, trim region shift cleanup
- Tests for caption annotation pipeline

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/components/video-editor/VideoEditor.tsx   |  9 ++++--
 src/i18n/locales/ar/editor.json               | 32 +++++++++----------
 src/i18n/locales/es/editor.json               | 32 +++++++++----------
 src/i18n/locales/fr/editor.json               | 32 +++++++++----------
 src/i18n/locales/ja-JP/editor.json            | 32 +++++++++----------
 src/i18n/locales/ko-KR/editor.json            | 32 +++++++++----------
 src/i18n/locales/tr/editor.json               | 32 +++++++++----------
 src/i18n/locales/zh-CN/editor.json            | 32 +++++++++----------
 src/i18n/locales/zh-TW/editor.json            | 32 +++++++++----------
 .../annotationsFromCaptions.test.ts           | 15 +++++++++
 src/lib/captioning/annotationsFromCaptions.ts |  5 ++-
 src/lib/captioning/extractMono16k.ts          |  2 +-
 .../captioning/extractMono16kWebDemuxer.ts    |  6 ++--
 src/lib/captioning/leadingSilence.ts          |  9 ++----
 src/lib/captioning/transcribe.ts              |  3 +-
 15 files changed, 159 insertions(+), 146 deletions(-)

diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx
index e96202ac9..2edf90dc9 100644
--- a/src/components/video-editor/VideoEditor.tsx
+++ b/src/components/video-editor/VideoEditor.tsx
@@ -1022,6 +1022,9 @@ export default function VideoEditor() {
 	const handleAnnotationSpanChange = useCallback(
 		(id: string, span: Span) => {
 			pushState((prev) => {
+				const editedAutoCaption =
+					prev.annotationRegions.find((region) => region.id === id)?.annotationSource ===
+					"auto-caption";
 				const next = prev.annotationRegions.map((region) =>
 					region.id === id
 						? {
@@ -1032,7 +1035,7 @@ export default function VideoEditor() {
 						: region,
 				);
 				return {
-					annotationRegions: reconcileAutoCaptionTimelineGaps(next),
+					annotationRegions: editedAutoCaption ? reconcileAutoCaptionTimelineGaps(next) : next,
 				};
 			});
 		},
@@ -1831,6 +1834,7 @@ export default function VideoEditor() {
 						...transcribeOptions,
 					},
 				);
+				let transcribedFromTrimmedBuffer = true;
 
 				// Some recordings come back empty after leading-silence trimming even though the full
 				// source has recognizable speech. Retry once against the untouched audio buffer before
@@ -1840,10 +1844,11 @@ export default function VideoEditor() {
 						trimRegions,
 						...transcribeOptions,
 					}));
+					transcribedFromTrimmedBuffer = false;
 				}
 
 				const segments =
-					trimSec > 0
+					transcribedFromTrimmedBuffer && trimSec > 0
 						? segmentsRaw.map((s) => ({
 								...s,
 								startSec: s.startSec + trimSec,
diff --git a/src/i18n/locales/ar/editor.json b/src/i18n/locales/ar/editor.json
index d0f99ed60..35ba08b23 100644
--- a/src/i18n/locales/ar/editor.json
+++ b/src/i18n/locales/ar/editor.json
@@ -43,21 +43,21 @@
 		"permissionDenied": "تم رفض إذن التسجيل. يرجى السماح بتسجيل الشاشة."
 	},
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "التسميات التوضيحية التلقائية",
+		"dialogTitle": "التسميات التوضيحية التلقائية",
+		"dialogDescription": "اختر تقريبا كم عدد الكلمات التي تظهر في كل تسمية توضيحية. يتم توزيع التوقيت عبر الكلمات في تلك العبارة.",
+		"minWords": "الحد الأدنى من الكلمات لكل تسمية",
+		"maxWords": "الحد الأقصى من الكلمات لكل تسمية",
+		"wordsCount": "{{count}} كلمة",
+		"generate": "توليد",
+		"dialogCancel": "إلغاء",
+		"generating": "جارٍ توليد التسميات من الصوت…",
+		"loadingModel": "جارٍ تحميل نموذج الكلام (سيتم تنزيل ~75 ميغابايت عند الاستخدام الأول)…",
+		"busy": "توليد التسميات قيد التنفيذ بالفعل.",
+		"done": "تمت إضافة {{count}} تسمية.",
+		"noneHeard": "لم يتم الكشف عن أي كلام.",
+		"noAudio": "لا يحتوي هذا الفيديو على صوت صالح للنسخ.",
+		"failed": "تعذّر توليد التسميات.",
+		"truncated": "تم نسخ الدقائق الأولى فقط: {{minutes}} دقيقة."
 	}
 }
diff --git a/src/i18n/locales/es/editor.json b/src/i18n/locales/es/editor.json
index 64faff9e7..c7a2c2aff 100644
--- a/src/i18n/locales/es/editor.json
+++ b/src/i18n/locales/es/editor.json
@@ -43,21 +43,21 @@
 		"confirm": "Confirmar"
 	},
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "Subtítulos automáticos",
+		"dialogTitle": "Subtítulos automáticos",
+		"dialogDescription": "Elige aproximadamente cuántas palabras muestra cada subtítulo a la vez. El tiempo se reparte entre las palabras de esa frase.",
+		"minWords": "Número mínimo de palabras por subtítulo",
+		"maxWords": "Número máximo de palabras por subtítulo",
+		"wordsCount": "{{count}} palabras",
+		"generate": "Generar",
+		"dialogCancel": "Cancelar",
+		"generating": "Generando subtítulos a partir del audio…",
+		"loadingModel": "Cargando el modelo de voz (el primer uso descarga ~75 MB)…",
+		"busy": "La generación de subtítulos ya está en curso.",
+		"done": "Se añadieron {{count}} subtítulos.",
+		"noneHeard": "No se detectó voz.",
+		"noAudio": "Este video no tiene audio utilizable para transcribir.",
+		"failed": "No se pudieron generar los subtítulos.",
+		"truncated": "Solo se transcribieron los primeros {{minutes}} minutos."
 	}
 }
diff --git a/src/i18n/locales/fr/editor.json b/src/i18n/locales/fr/editor.json
index 4137a7035..e1acdc741 100644
--- a/src/i18n/locales/fr/editor.json
+++ b/src/i18n/locales/fr/editor.json
@@ -43,21 +43,21 @@
 	},
 	"loadingVideo": "Chargement de la vidéo...",
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "Sous-titres automatiques",
+		"dialogTitle": "Sous-titres automatiques",
+		"dialogDescription": "Choisissez approximativement combien de mots chaque sous-titre affiche à la fois. Le timing est réparti entre les mots de cette phrase.",
+		"minWords": "Nombre minimum de mots par sous-titre",
+		"maxWords": "Nombre maximum de mots par sous-titre",
+		"wordsCount": "{{count}} mots",
+		"generate": "Générer",
+		"dialogCancel": "Annuler",
+		"generating": "Génération des sous-titres à partir de l'audio…",
+		"loadingModel": "Chargement du modèle vocal (le premier usage télécharge ~75 Mo)…",
+		"busy": "La génération des sous-titres est déjà en cours.",
+		"done": "{{count}} sous-titres ajoutés.",
+		"noneHeard": "Aucune parole n'a été détectée.",
+		"noAudio": "Cette vidéo ne contient pas d'audio exploitable pour la transcription.",
+		"failed": "Impossible de générer les sous-titres.",
+		"truncated": "Seules les {{minutes}} premières minutes ont été transcrites."
 	}
 }
diff --git a/src/i18n/locales/ja-JP/editor.json b/src/i18n/locales/ja-JP/editor.json
index e488dadc8..fb755db6a 100644
--- a/src/i18n/locales/ja-JP/editor.json
+++ b/src/i18n/locales/ja-JP/editor.json
@@ -43,21 +43,21 @@
 		"cameraNotFound": "カメラが見つかりません。"
 	},
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "自動キャプション",
+		"dialogTitle": "自動キャプション",
+		"dialogDescription": "各キャプションに一度に表示する語数の目安を選びます。タイミングはそのフレーズ内の語に分配されます。",
+		"minWords": "キャプションあたりの最小語数",
+		"maxWords": "キャプションあたりの最大語数",
+		"wordsCount": "{{count}} 語",
+		"generate": "生成",
+		"dialogCancel": "キャンセル",
+		"generating": "音声からキャプションを生成しています…",
+		"loadingModel": "音声モデルを読み込んでいます（初回利用時は約 75 MB をダウンロードします）…",
+		"busy": "キャプションの生成はすでに実行中です。",
+		"done": "{{count}} 件のキャプションを追加しました。",
+		"noneHeard": "音声が検出されませんでした。",
+		"noAudio": "この動画には書き起こしに使える音声がありません。",
+		"failed": "キャプションを生成できませんでした。",
+		"truncated": "最初の {{minutes}} 分のみが書き起こされました。"
 	}
 }
diff --git a/src/i18n/locales/ko-KR/editor.json b/src/i18n/locales/ko-KR/editor.json
index 2c57b9c13..51712b22f 100644
--- a/src/i18n/locales/ko-KR/editor.json
+++ b/src/i18n/locales/ko-KR/editor.json
@@ -43,21 +43,21 @@
 		"cameraNotFound": "카메라를 찾을 수 없습니다."
 	},
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "자동 자막",
+		"dialogTitle": "자동 자막",
+		"dialogDescription": "각 자막에 한 번에 표시할 단어 수의 대략적인 값을 선택하세요. 타이밍은 해당 구문의 단어들에 나뉩니다.",
+		"minWords": "자막당 최소 단어 수",
+		"maxWords": "자막당 최대 단어 수",
+		"wordsCount": "{{count}}개 단어",
+		"generate": "생성",
+		"dialogCancel": "취소",
+		"generating": "오디오에서 자막을 생성하는 중…",
+		"loadingModel": "음성 모델을 불러오는 중(첫 사용 시 약 75MB 다운로드)…",
+		"busy": "자막 생성이 이미 진행 중입니다.",
+		"done": "자막 {{count}}개를 추가했습니다.",
+		"noneHeard": "음성이 감지되지 않았습니다.",
+		"noAudio": "이 동영상에는 전사에 사용할 수 있는 음성이 없습니다.",
+		"failed": "자막을 생성할 수 없습니다.",
+		"truncated": "처음 {{minutes}}분만 전사되었습니다."
 	}
 }
diff --git a/src/i18n/locales/tr/editor.json b/src/i18n/locales/tr/editor.json
index e5946a9a0..e85ae5ebe 100644
--- a/src/i18n/locales/tr/editor.json
+++ b/src/i18n/locales/tr/editor.json
@@ -43,21 +43,21 @@
 		"confirm": "Onayla"
 	},
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "Otomatik altyazılar",
+		"dialogTitle": "Otomatik altyazılar",
+		"dialogDescription": "Her altyazının aynı anda yaklaşık kaç kelime göstermesini istediğinizi seçin. Zamanlama, o ifadedeki kelimelere dağıtılır.",
+		"minWords": "Altyazı başına en az kelime",
+		"maxWords": "Altyazı başına en fazla kelime",
+		"wordsCount": "{{count}} kelime",
+		"generate": "Oluştur",
+		"dialogCancel": "İptal",
+		"generating": "Sesten altyazılar oluşturuluyor…",
+		"loadingModel": "Konuşma modeli yükleniyor (ilk kullanımda ~75 MB indirilir)…",
+		"busy": "Altyazı oluşturma zaten devam ediyor.",
+		"done": "{{count}} altyazı eklendi.",
+		"noneHeard": "Konuşma algılanmadı.",
+		"noAudio": "Bu videoda yazıya dökülebilecek kullanılabilir bir ses yok.",
+		"failed": "Altyazılar oluşturulamadı.",
+		"truncated": "Yalnızca ilk {{minutes}} dakika yazıya döküldü."
 	}
 }
diff --git a/src/i18n/locales/zh-CN/editor.json b/src/i18n/locales/zh-CN/editor.json
index ca48fa6df..4f576f4c4 100644
--- a/src/i18n/locales/zh-CN/editor.json
+++ b/src/i18n/locales/zh-CN/editor.json
@@ -43,21 +43,21 @@
 		"permissionDenied": "录屏权限被拒绝。请允许屏幕录制。"
 	},
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "自动字幕",
+		"dialogTitle": "自动字幕",
+		"dialogDescription": "大致选择每条字幕一次显示多少个字词。时间会在该语句内的字词之间分配。",
+		"minWords": "每条字幕的最少字数",
+		"maxWords": "每条字幕的最多字数",
+		"wordsCount": "{{count}} 个词",
+		"generate": "生成",
+		"dialogCancel": "取消",
+		"generating": "正在从音频生成字幕…",
+		"loadingModel": "正在加载语音模型（首次使用将下载约 75 MB）…",
+		"busy": "字幕生成已在进行中。",
+		"done": "已添加 {{count}} 条字幕。",
+		"noneHeard": "未检测到语音。",
+		"noAudio": "此视频没有可用于转写的音频。",
+		"failed": "无法生成字幕。",
+		"truncated": "仅转写了最前 {{minutes}} 分钟。"
 	}
 }
diff --git a/src/i18n/locales/zh-TW/editor.json b/src/i18n/locales/zh-TW/editor.json
index 2ba567499..b61f408eb 100644
--- a/src/i18n/locales/zh-TW/editor.json
+++ b/src/i18n/locales/zh-TW/editor.json
@@ -43,21 +43,21 @@
 		"cameraNotFound": "找不到攝影機。"
 	},
 	"autoCaptions": {
-		"button": "Auto captions",
-		"dialogTitle": "Auto captions",
-		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
-		"minWords": "Minimum words per caption",
-		"maxWords": "Maximum words per caption",
-		"wordsCount": "{{count}} words",
-		"generate": "Generate",
-		"dialogCancel": "Cancel",
-		"generating": "Generating captions from audio…",
-		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
-		"busy": "Caption generation is already in progress.",
-		"done": "Added {{count}} captions.",
-		"noneHeard": "No speech was detected.",
-		"noAudio": "This video has no usable audio to transcribe.",
-		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"button": "自動字幕",
+		"dialogTitle": "自動字幕",
+		"dialogDescription": "大致選擇每條字幕一次顯示多少字詞。時間會在該語句內的字詞之間分配。",
+		"minWords": "每條字幕的最少字數",
+		"maxWords": "每條字幕的最多字數",
+		"wordsCount": "{{count}} 個詞",
+		"generate": "產生",
+		"dialogCancel": "取消",
+		"generating": "正在從音訊產生字幕…",
+		"loadingModel": "正在載入語音模型（首次使用將下載約 75 MB）…",
+		"busy": "字幕產生已在進行中。",
+		"done": "已新增 {{count}} 條字幕。",
+		"noneHeard": "未偵測到語音。",
+		"noAudio": "此影片沒有可用於轉寫的音訊。",
+		"failed": "無法產生字幕。",
+		"truncated": "僅轉寫了最前 {{minutes}} 分鐘。"
 	}
 }
diff --git a/src/lib/captioning/annotationsFromCaptions.test.ts b/src/lib/captioning/annotationsFromCaptions.test.ts
index ea2b3a7be..31679f7f5 100644
--- a/src/lib/captioning/annotationsFromCaptions.test.ts
+++ b/src/lib/captioning/annotationsFromCaptions.test.ts
@@ -73,6 +73,21 @@ describe("captionSegmentsToAnnotationRegions", () => {
 		expect(lines[0]).toMatchObject({ startSec: 0, endSec: 0.28, text: "first caption" });
 		expect(lines[1]).toMatchObject({ startSec: 0.7, endSec: 0.98, text: "second caption" });
 	});
+
+	it("preserves repeated words before grouping in word mode", () => {
+		const { regions } = captionSegmentsToAnnotationRegions(
+			[
+				{ startSec: 0, endSec: 0.12, text: "I" },
+				{ startSec: 0.13, endSec: 0.25, text: "I" },
+			],
+			1,
+			1,
+			{ minWordsPerCaption: 2, maxWordsPerCaption: 2, timestampGranularity: "word" },
+		);
+
+		expect(regions).toHaveLength(1);
+		expect(regions[0]).toMatchObject({ content: "I I" });
+	});
 });
 
 describe("reconcileAutoCaptionTimelineGaps", () => {
diff --git a/src/lib/captioning/annotationsFromCaptions.ts b/src/lib/captioning/annotationsFromCaptions.ts
index cfae163ee..ca82b0477 100644
--- a/src/lib/captioning/annotationsFromCaptions.ts
+++ b/src/lib/captioning/annotationsFromCaptions.ts
@@ -532,15 +532,14 @@ export function captionSegmentsToAnnotationRegions(
 ): { regions: AnnotationRegion[]; nextNumericId: number; nextZIndex: number } {
 	// Do not echo-collapse raw word tokens before grouping: repeated words ("I … I") share a
 	// normalized key and would merge spans while keeping only the first token's text.
-	const dedupedIn = dedupeAdjacentCaptionRepeats(segments);
 	const minW = layout?.minWordsPerCaption ?? 2;
 	const maxW = layout?.maxWordsPerCaption ?? 7;
 	const granularity = layout?.timestampGranularity ?? "word";
 
 	const grouped =
 		granularity === "phrase"
-			? groupPhraseCaptionSegmentsIntoLines(dedupedIn, minW, maxW)
-			: groupTimedCaptionWordsIntoLines(dedupedIn, minW, maxW);
+			? groupPhraseCaptionSegmentsIntoLines(segments, minW, maxW)
+			: groupTimedCaptionWordsIntoLines(segments, minW, maxW);
 
 	const dedupedOut = dedupeAdjacentCaptionRepeats(grouped);
 	const finalized = finalizeCaptionSegmentsForPlayback(dedupedOut);
diff --git a/src/lib/captioning/extractMono16k.ts b/src/lib/captioning/extractMono16k.ts
index 9e932ea3d..53258567c 100644
--- a/src/lib/captioning/extractMono16k.ts
+++ b/src/lib/captioning/extractMono16k.ts
@@ -99,7 +99,7 @@ async function truncateAndResampleTo16k(
 	}
 
 	const samples = await resampleMono(work, fromRate, 16_000, signal);
-	return { samples, truncated, durationSec };
+	return { samples, truncated, durationSec: samples.length / 16_000 };
 }
 
 /**
diff --git a/src/lib/captioning/extractMono16kWebDemuxer.ts b/src/lib/captioning/extractMono16kWebDemuxer.ts
index 4898ca011..fd85f5703 100644
--- a/src/lib/captioning/extractMono16kWebDemuxer.ts
+++ b/src/lib/captioning/extractMono16kWebDemuxer.ts
@@ -42,7 +42,7 @@ function audioDataFrameToMono(frame: AudioData): Float32Array {
 	return out;
 }
 
-function mergeDecodedAudioToMonoLinear(
+function mergeAndConsumeDecodedAudioToMonoLinear(
 	frames: AudioData[],
 	sampleRate: number,
 	durationSec: number,
@@ -129,7 +129,7 @@ export async function extractMonoPcmViaWebDemuxer(
 	const sampleRate = audioConfig.sampleRate || 48_000;
 
 	// Many WebM/Matroska files report a too-short duration; capping read at reported time stops
-	// demux early and mergeDecodedAudioToMonoLinear clips everything past that. Read up to the
+	// demux early and mergeAndConsumeDecodedAudioToMonoLinear clips everything past that. Read up to the
 	// same ceiling as caption decode (demuxer stops when the track ends).
 	const readEndSec = MAX_CAPTION_AUDIO_SEC + READ_END_PADDING_SEC;
 	const decodedFrames: AudioData[] = [];
@@ -182,6 +182,6 @@ export async function extractMonoPcmViaWebDemuxer(
 	// duration, fall back to reported metadata.
 	const durationSec = inferredDurationSec > 0.02 ? inferredDurationSec : reportedDurationSec;
 
-	const mono = mergeDecodedAudioToMonoLinear(decodedFrames, sampleRate, durationSec);
+	const mono = mergeAndConsumeDecodedAudioToMonoLinear(decodedFrames, sampleRate, durationSec);
 	return { mono, sampleRate, durationSec };
 }
diff --git a/src/lib/captioning/leadingSilence.ts b/src/lib/captioning/leadingSilence.ts
index 5ad7c50f4..4bd6a11aa 100644
--- a/src/lib/captioning/leadingSilence.ts
+++ b/src/lib/captioning/leadingSilence.ts
@@ -71,13 +71,8 @@ export function shiftTrimRegionsMsForCaptionBuffer(
 	return regions
 		.map((r) => ({
 			...r,
-			startMs: r.startMs - trimMs,
-			endMs: r.endMs - trimMs,
-		}))
-		.map((r) => ({
-			...r,
-			startMs: Math.max(0, r.startMs),
-			endMs: Math.max(0, r.endMs),
+			startMs: Math.max(0, r.startMs - trimMs),
+			endMs: Math.max(0, r.endMs - trimMs),
 		}))
 		.filter((r) => r.endMs > r.startMs);
 }
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
index e3c854cff..1c649c613 100644
--- a/src/lib/captioning/transcribe.ts
+++ b/src/lib/captioning/transcribe.ts
@@ -124,10 +124,9 @@ function segmentsFromTranscriberChunks(
 
 	segments.sort((u, v) => u.startSec - v.startSec || u.endSec - v.endSec);
 	const rawDeduped: CaptionSegment[] = [];
-	const CHUNK_DUP_MAX_GAP_SEC = 0.42;
 	for (const seg of segments) {
 		const prev = rawDeduped[rawDeduped.length - 1];
-		if (prev && prev.text === seg.text && seg.startSec <= prev.endSec + CHUNK_DUP_MAX_GAP_SEC) {
+		if (prev && prev.text === seg.text && seg.startSec <= prev.endSec) {
 			prev.endSec = Math.max(prev.endSec, seg.endSec);
 			prev.startSec = Math.min(prev.startSec, seg.startSec);
 			continue;

From 3a48e54db1b17d0b876f46b63ae41f3ac1ce2ac4 Mon Sep 17 00:00:00 2001
From: parse-nip <152457438+parse-nip@users.noreply.github.com>
Date: Thu, 14 May 2026 10:51:31 +0800
Subject: [PATCH 4/7] fix: time-slice single-group phrase captions into lines

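In phrase mode, a group that contains a single merged span is now split
along the line breaks produced by wrapCaptionTextByWordBounds into separate
CaptionSegments, with the span's duration divided evenly among the lines.
When the span is too short to give every line WORD_SPLIT_MIN_SPAN_SEC, it
falls back to one unsplit caption with the lines joined by spaces.
Update the unit test accordingly.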

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../annotationsFromCaptions.test.ts           | 11 +++--
 src/lib/captioning/annotationsFromCaptions.ts | 41 ++++++++++++++++---
 2 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/lib/captioning/annotationsFromCaptions.test.ts b/src/lib/captioning/annotationsFromCaptions.test.ts
index 31679f7f5..bbf26fed2 100644
--- a/src/lib/captioning/annotationsFromCaptions.test.ts
+++ b/src/lib/captioning/annotationsFromCaptions.test.ts
@@ -24,18 +24,23 @@ describe("groupPhraseCaptionSegmentsIntoLines", () => {
 		expect(lines[0]!.endSec).toBeLessThanOrEqual(0.62);
 	});
 
-	it("wraps a single phrase into multiple visual lines without inventing extra timing regions", () => {
+	it("slices a single merged phrase into timed caption lines by word bounds", () => {
 		const lines = groupPhraseCaptionSegmentsIntoLines(
 			[{ startSec: 0, endSec: 1, text: "alpha beta gamma delta" }],
 			2,
 			2,
 		);
 
-		expect(lines).toHaveLength(1);
+		expect(lines).toHaveLength(2);
 		expect(lines[0]).toMatchObject({
 			startSec: 0,
+			endSec: 0.5,
+			text: "alpha beta",
+		});
+		expect(lines[1]).toMatchObject({
+			startSec: 0.5,
 			endSec: 1,
-			text: "alpha beta\ngamma delta",
+			text: "gamma delta",
 		});
 	});
 });
diff --git a/src/lib/captioning/annotationsFromCaptions.ts b/src/lib/captioning/annotationsFromCaptions.ts
index ca82b0477..0f6dc2af4 100644
--- a/src/lib/captioning/annotationsFromCaptions.ts
+++ b/src/lib/captioning/annotationsFromCaptions.ts
@@ -459,11 +459,42 @@ export function groupPhraseCaptionSegmentsIntoLines(
 	for (const group of groups) {
 		if (group.length === 1) {
 			const only = group[0]!;
-			out.push({
-				startSec: only.startSec,
-				endSec: only.endSec,
-				text: wrapCaptionTextByWordBounds(only.text, minWords, maxWords),
-			});
+			const wrapped = wrapCaptionTextByWordBounds(only.text, minWords, maxWords).trim();
+			if (!wrapped) continue;
+			const lineTexts = wrapped
+				.split("\n")
+				.map((t) => t.trim())
+				.filter(Boolean);
+			const n = lineTexts.length;
+			const rawDur = only.endSec - only.startSec;
+			if (n > 1 && rawDur < n * WORD_SPLIT_MIN_SPAN_SEC) {
+				out.push({
+					startSec: only.startSec,
+					endSec: only.endSec,
+					text: lineTexts.join(" "),
+				});
+				continue;
+			}
+			const dur = Math.max(rawDur, WORD_SPLIT_MIN_SPAN_SEC * n);
+			if (n <= 1) {
+				out.push({
+					startSec: only.startSec,
+					endSec: only.endSec,
+					text: lineTexts[0] ?? wrapped,
+				});
+				continue;
+			}
+			for (let i = 0; i < n; i++) {
+				const startSec = only.startSec + (dur * i) / n;
+				const boundary = only.startSec + (dur * (i + 1)) / n;
+				const endSec =
+					i === n - 1 ? only.endSec : Math.max(startSec + WORD_SPLIT_MIN_SPAN_SEC, boundary);
+				out.push({
+					startSec,
+					endSec,
+					text: lineTexts[i]!,
+				});
+			}
 			continue;
 		}
 

From a0bcd9fe90631dd2ed49d4c89c6b3683d9014d56 Mon Sep 17 00:00:00 2001
From: parse-nip <152457438+parse-nip@users.noreply.github.com>
Date: Fri, 15 May 2026 12:28:18 +0800
Subject: [PATCH 5/7] feat(editor): polish auto-caption loading toasts

Use one Sonner id for the full caption flow, show a distinct transcribing step, yield before Whisper so updates paint, match editor dark chrome on toasts, and keep pointer-events only on toast bodies. Add transcribing strings across locales.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/App.tsx                                 |  2 +-
 src/components/ui/sonner.tsx                | 14 ++++++---
 src/components/video-editor/VideoEditor.tsx | 35 +++++++++++++--------
 src/i18n/locales/ar/editor.json             |  1 +
 src/i18n/locales/en/editor.json             |  1 +
 src/i18n/locales/es/editor.json             |  1 +
 src/i18n/locales/fr/editor.json             |  1 +
 src/i18n/locales/ja-JP/editor.json          |  1 +
 src/i18n/locales/ko-KR/editor.json          |  1 +
 src/i18n/locales/tr/editor.json             |  1 +
 src/i18n/locales/zh-CN/editor.json          |  1 +
 src/i18n/locales/zh-TW/editor.json          |  1 +
 src/lib/captioning/transcribe.ts            | 17 ++++++++++
 13 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/src/App.tsx b/src/App.tsx
index 6f737b9b0..78cf66c00 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -82,7 +82,7 @@ export default function App() {
 	return (
 		<TooltipProvider>
 			{content}
-			<Toaster theme="dark" className="pointer-events-auto" />
+			<Toaster theme="dark" />
 		</TooltipProvider>
 	);
 }
diff --git a/src/components/ui/sonner.tsx b/src/components/ui/sonner.tsx
index fe3a3906a..3076ac1fe 100644
--- a/src/components/ui/sonner.tsx
+++ b/src/components/ui/sonner.tsx
@@ -1,18 +1,22 @@
 import { Toaster as Sonner } from "sonner";
+import { cn } from "@/lib/utils";
 
 type ToasterProps = React.ComponentProps<typeof Sonner>;
 
-const Toaster = ({ ...props }: ToasterProps) => {
+const Toaster = ({ className, ...props }: ToasterProps) => {
 	return (
 		<Sonner
-			theme="light"
-			className="toaster group"
+			theme="dark"
+			className={cn(
+				"dark toaster group pointer-events-none [&_[data-sonner-toast]]:pointer-events-auto",
+				className,
+			)}
 			duration={3000}
 			toastOptions={{
 				classNames: {
 					toast:
-						"group toast group-[.toaster]:bg-background group-[.toaster]:text-foreground group-[.toaster]:border-border group-[.toaster]:shadow-lg",
-					description: "group-[.toast]:text-muted-foreground",
+						"group toast border border-white/10 bg-[#09090b] text-slate-200 shadow-lg backdrop-blur-xl",
+					description: "group-[.toast]:text-slate-400",
 					actionButton: "group-[.toast]:bg-primary group-[.toast]:text-primary-foreground",
 					cancelButton: "group-[.toast]:bg-muted group-[.toast]:text-muted-foreground",
 				},
diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx
index c2d264dfa..20a801664 100644
--- a/src/components/video-editor/VideoEditor.tsx
+++ b/src/components/video-editor/VideoEditor.tsx
@@ -109,6 +109,9 @@ import {
 import { UnsavedChangesDialog } from "./UnsavedChangesDialog";
 import VideoPlayback, { VideoPlaybackRef } from "./VideoPlayback";
 
+/** Single Sonner slot for auto-caption progress so phases update in place instead of stacking. */
+const AUTO_CAPTION_PROGRESS_TOAST_ID = "auto-caption-progress";
+
 function isClickInteractionType(interactionType: string | null | undefined) {
 	return (
 		interactionType === "click" ||
@@ -2030,18 +2033,18 @@ export default function VideoEditor() {
 
 			isAutoCaptioningRef.current = true;
 			setIsAutoCaptioning(true);
-			const toastId = toast.loading(t("autoCaptions.generating"));
+			toast.loading(t("autoCaptions.generating"), { id: AUTO_CAPTION_PROGRESS_TOAST_ID });
 			try {
 				const { samples, truncated, durationSec } = await extractMono16kFromVideoUrl(videoPath);
 				if (!Number.isFinite(durationSec) || durationSec <= 0 || samples.length < 800) {
-					toast.dismiss(toastId);
+					toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
 					toast.error(t("autoCaptions.noAudio"));
 					return;
 				}
 
 				const { samples: speechSamples, trimSec } = trimLeadingSilenceMono16k(samples);
 				if (speechSamples.length < 800) {
-					toast.dismiss(toastId);
+					toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
 					toast.error(t("autoCaptions.noAudio"));
 					return;
 				}
@@ -2052,7 +2055,13 @@ export default function VideoEditor() {
 				const transcribeOptions = {
 					onStatus: (phase: "model" | "transcribe") => {
 						if (phase === "model") {
-							toast.loading(t("autoCaptions.loadingModel"), { id: toastId });
+							toast.loading(t("autoCaptions.loadingModel"), {
+								id: AUTO_CAPTION_PROGRESS_TOAST_ID,
+							});
+						} else {
+							toast.loading(t("autoCaptions.transcribing"), {
+								id: AUTO_CAPTION_PROGRESS_TOAST_ID,
+							});
 						}
 					},
 				};
@@ -2111,7 +2120,7 @@ export default function VideoEditor() {
 				}
 
 				if (regions.length === 0) {
-					toast.dismiss(toastId);
+					toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
 					toast.info(t("autoCaptions.noneHeard"));
 					return;
 				}
@@ -2120,18 +2129,18 @@ export default function VideoEditor() {
 				nextAnnotationIdRef.current = nextNumericId;
 				nextAnnotationZIndexRef.current = nextZIndex;
 
-				toast.dismiss(toastId);
+				toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
+				const minutesTrunc = String(Math.round(MAX_CAPTION_AUDIO_SEC / 60));
 				if (truncated) {
-					toast.info(
-						t("autoCaptions.truncated", {
-							minutes: String(Math.round(MAX_CAPTION_AUDIO_SEC / 60)),
-						}),
-					);
+					toast.success(t("autoCaptions.done", { count: String(regions.length) }), {
+						description: t("autoCaptions.truncated", { minutes: minutesTrunc }),
+					});
+				} else {
+					toast.success(t("autoCaptions.done", { count: String(regions.length) }));
 				}
-				toast.success(t("autoCaptions.done", { count: String(regions.length) }));
 			} catch (e) {
 				console.error(e);
-				toast.dismiss(toastId);
+				toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
 				const detail = e instanceof Error ? e.message : String(e);
 				toast.error(t("autoCaptions.failed"), { description: detail });
 			} finally {
diff --git a/src/i18n/locales/ar/editor.json b/src/i18n/locales/ar/editor.json
index 35ba08b23..de48f2614 100644
--- a/src/i18n/locales/ar/editor.json
+++ b/src/i18n/locales/ar/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "إلغاء",
 		"generating": "جارٍ توليد التسميات من الصوت…",
 		"loadingModel": "جارٍ تحميل نموذج الكلام (سيتم تنزيل ~75 ميغابايت عند الاستخدام الأول)…",
+		"transcribing": "جارٍ نسخ الكلام إلى نص…",
 		"busy": "توليد التسميات قيد التنفيذ بالفعل.",
 		"done": "تمت إضافة {{count}} تسمية.",
 		"noneHeard": "لم يتم الكشف عن أي كلام.",
diff --git a/src/i18n/locales/en/editor.json b/src/i18n/locales/en/editor.json
index ce62aeddc..0473efa4b 100644
--- a/src/i18n/locales/en/editor.json
+++ b/src/i18n/locales/en/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "Cancel",
 		"generating": "Generating captions from audio…",
 		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"transcribing": "Transcribing speech…",
 		"busy": "Caption generation is already in progress.",
 		"done": "Added {{count}} captions.",
 		"noneHeard": "No speech was detected.",
diff --git a/src/i18n/locales/es/editor.json b/src/i18n/locales/es/editor.json
index c7a2c2aff..c5a75b1a1 100644
--- a/src/i18n/locales/es/editor.json
+++ b/src/i18n/locales/es/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "Cancelar",
 		"generating": "Generando subtítulos a partir del audio…",
 		"loadingModel": "Cargando el modelo de voz (el primer uso descarga ~75 MB)…",
+		"transcribing": "Transcribiendo el habla…",
 		"busy": "La generación de subtítulos ya está en curso.",
 		"done": "Se añadieron {{count}} subtítulos.",
 		"noneHeard": "No se detectó voz.",
diff --git a/src/i18n/locales/fr/editor.json b/src/i18n/locales/fr/editor.json
index e1acdc741..8a5506f5f 100644
--- a/src/i18n/locales/fr/editor.json
+++ b/src/i18n/locales/fr/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "Annuler",
 		"generating": "Génération des sous-titres à partir de l'audio…",
 		"loadingModel": "Chargement du modèle vocal (le premier usage télécharge ~75 MB)…",
+		"transcribing": "Transcription de la parole…",
 		"busy": "La génération des sous-titres est déjà en cours.",
 		"done": "{{count}} sous-titres ajoutés.",
 		"noneHeard": "Aucune parole n'a été détectée.",
diff --git a/src/i18n/locales/ja-JP/editor.json b/src/i18n/locales/ja-JP/editor.json
index fb755db6a..30c83e9cc 100644
--- a/src/i18n/locales/ja-JP/editor.json
+++ b/src/i18n/locales/ja-JP/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "キャンセル",
 		"generating": "音声からキャプションを生成しています…",
 		"loadingModel": "音声モデルを読み込んでいます（初回利用時は約 75 MB をダウンロードします）…",
+		"transcribing": "音声を文字起こししています…",
 		"busy": "キャプションの生成はすでに実行中です。",
 		"done": "{{count}} 件のキャプションを追加しました。",
 		"noneHeard": "音声が検出されませんでした。",
diff --git a/src/i18n/locales/ko-KR/editor.json b/src/i18n/locales/ko-KR/editor.json
index 51712b22f..e429d1b6d 100644
--- a/src/i18n/locales/ko-KR/editor.json
+++ b/src/i18n/locales/ko-KR/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "취소",
 		"generating": "오디오에서 자막을 생성하는 중…",
 		"loadingModel": "음성 모델을 불러오는 중(첫 사용 시 약 75MB 다운로드)…",
+		"transcribing": "음성을 전사하는 중…",
 		"busy": "자막 생성이 이미 진행 중입니다.",
 		"done": "자막 {{count}}개를 추가했습니다.",
 		"noneHeard": "음성이 감지되지 않았습니다.",
diff --git a/src/i18n/locales/tr/editor.json b/src/i18n/locales/tr/editor.json
index e85ae5ebe..475bbd3e5 100644
--- a/src/i18n/locales/tr/editor.json
+++ b/src/i18n/locales/tr/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "İptal",
 		"generating": "Sesten altyazılar oluşturuluyor…",
 		"loadingModel": "Konuşma modeli yükleniyor (ilk kullanımda ~75 MB indirilir)…",
+		"transcribing": "Konuşma yazıya dökülüyor…",
 		"busy": "Altyazı oluşturma zaten devam ediyor.",
 		"done": "{{count}} altyazı eklendi.",
 		"noneHeard": "Konuşma algılanmadı.",
diff --git a/src/i18n/locales/zh-CN/editor.json b/src/i18n/locales/zh-CN/editor.json
index 4f576f4c4..2b6b57425 100644
--- a/src/i18n/locales/zh-CN/editor.json
+++ b/src/i18n/locales/zh-CN/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "取消",
 		"generating": "正在从音频生成字幕…",
 		"loadingModel": "正在加载语音模型（首次使用将下载约 75 MB）…",
+		"transcribing": "正在转写语音…",
 		"busy": "字幕生成已在进行中。",
 		"done": "已添加 {{count}} 条字幕。",
 		"noneHeard": "未检测到语音。",
diff --git a/src/i18n/locales/zh-TW/editor.json b/src/i18n/locales/zh-TW/editor.json
index b61f408eb..dcc4dce56 100644
--- a/src/i18n/locales/zh-TW/editor.json
+++ b/src/i18n/locales/zh-TW/editor.json
@@ -53,6 +53,7 @@
 		"dialogCancel": "取消",
 		"generating": "正在從音訊產生字幕…",
 		"loadingModel": "正在載入語音模型（首次使用將下載約 75 MB）…",
+		"transcribing": "正在轉錄語音…",
 		"busy": "字幕產生已在進行中。",
 		"done": "已新增 {{count}} 條字幕。",
 		"noneHeard": "未偵測到語音。",
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
index 1c649c613..83fb36510 100644
--- a/src/lib/captioning/transcribe.ts
+++ b/src/lib/captioning/transcribe.ts
@@ -18,6 +18,18 @@ function segmentOverlapsTrim(startMs: number, endMs: number, trims: TrimRegion[]
 	return trims.some((t) => startMs < t.endMs && endMs > t.startMs);
 }
 
+/** Lets the browser paint toast / in-app status before Whisper blocks the main thread (WASM may not yield). */
+async function yieldForUiPaint(): Promise<void> {
+	await new Promise<void>((resolve) => setTimeout(resolve, 0));
+	await new Promise<void>((resolve) => {
+		requestAnimationFrame(() => {
+			requestAnimationFrame(() => resolve());
+		});
+	});
+	// macrotask after rAF so React/Sonner state can commit under load.
+	await new Promise<void>((resolve) => setTimeout(resolve, 0));
+}
+
 /**
  * ONNX Runtime's wasm bundle treats `process.versions.node` (present in Electron's
  * renderer) as Node and tries `require("fs")`, which Vite does not support. Mask it
@@ -212,8 +224,13 @@ export async function transcribeMono16kToSegments(
 
 		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
 
+		await yieldForUiPaint();
+
 		const trims = options?.trimRegions ?? [];
 		options?.onStatus?.("transcribe");
+		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+		await yieldForUiPaint();
+		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
 
 		const transcribeOne = async (
 			ignoreTrims: boolean,

From 2ee00a7cfa35b2875b3e2f29c3241b08e95ccfca Mon Sep 17 00:00:00 2001
From: parse-nip <152457438+parse-nip@users.noreply.github.com>
Date: Fri, 15 May 2026 12:38:10 +0800
Subject: [PATCH 6/7] fix(captioning): narrow ORT node shim, trim retries, and
 abort checks

Scope withoutNodeVersion to Transformers import/pipeline only; reapply trim-region filtering after ignore-trims retries; check AbortSignal after each slice inference before chunk processing.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/lib/captioning/transcribe.ts | 176 ++++++++++++++++++-------------
 1 file changed, 100 insertions(+), 76 deletions(-)

diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
index 83fb36510..e85f15157 100644
--- a/src/lib/captioning/transcribe.ts
+++ b/src/lib/captioning/transcribe.ts
@@ -18,6 +18,19 @@ function segmentOverlapsTrim(startMs: number, endMs: number, trims: TrimRegion[]
 	return trims.some((t) => startMs < t.endMs && endMs > t.startMs);
 }
 
+/** Same trim-out rule as {@link segmentsFromTranscriberChunks}; for retry passes that used empty trims. */
+function dropSegmentsOverlappingTrimRegions(
+	segments: CaptionSegment[],
+	trimRegions: TrimRegion[],
+): CaptionSegment[] {
+	if (trimRegions.length === 0) return segments;
+	return segments.filter((s) => {
+		const startMs = Math.round(s.startSec * 1000);
+		const endMs = Math.round(s.endSec * 1000);
+		return !segmentOverlapsTrim(startMs, endMs, trimRegions);
+	});
+}
+
 /** Lets the browser paint toast / in-app status before Whisper blocks the main thread (WASM may not yield). */
 async function yieldForUiPaint(): Promise<void> {
 	await new Promise<void>((resolve) => setTimeout(resolve, 0));
@@ -211,7 +224,9 @@ export async function transcribeMono16kToSegments(
 		signal?: AbortSignal;
 	},
 ): Promise<TranscribeMono16kResult> {
-	return withoutNodeVersion(async () => {
+	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+
+	const { transcriber } = await withoutNodeVersion(async () => {
 		const { pipeline, env } = await import("@xenova/transformers");
 		env.allowLocalModels = false;
 
@@ -220,92 +235,101 @@ export async function transcribeMono16kToSegments(
 		options?.onStatus?.("model");
 		// Default tiny weights only: the `output_attentions` revision has regressed inference for
 		// some environments (empty chunks / thrown errors) while phrase mode works on this model.
-		const transcriber = await pipeline("automatic-speech-recognition", "Xenova/whisper-tiny");
+		const t = await pipeline("automatic-speech-recognition", "Xenova/whisper-tiny");
+		return { transcriber: t };
+	});
 
-		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
 
-		await yieldForUiPaint();
+	await yieldForUiPaint();
 
-		const trims = options?.trimRegions ?? [];
-		options?.onStatus?.("transcribe");
-		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-		await yieldForUiPaint();
-		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	const trims = options?.trimRegions ?? [];
+	options?.onStatus?.("transcribe");
+	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	await yieldForUiPaint();
+	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+
+	const transcribeOne = async (
+		ignoreTrims: boolean,
+		forceFullSequences: boolean,
+		timestampMode: "word" | "phrase",
+	): Promise<CaptionSegment[]> => {
+		try {
+			const activeTrims = ignoreTrims ? [] : trims;
+			if (samples.length <= TRANSCRIBE_SLICE_SAMPLES) {
+				const { slice, realDurationSec } = padTailSliceForTranscribe(samples);
+				const result = await runTranscriberOnSlice(transcriber, slice, {
+					forceFullSequences,
+					timestampMode,
+				});
+				if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+				return segmentsFromTranscriberChunks(
+					extractChunksFromAsrResult(result),
+					0,
+					activeTrims,
+					realDurationSec,
+				);
+			}
+
+			const all: CaptionSegment[] = [];
+			for (let offset = 0; offset < samples.length; offset += TRANSCRIBE_SLICE_SAMPLES) {
+				if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+				const end = Math.min(offset + TRANSCRIBE_SLICE_SAMPLES, samples.length);
+				const sliceRaw = samples.subarray(offset, end);
+				const isFinalSlice = end >= samples.length;
+				if (sliceRaw.length === 0) continue;
+				if (sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && !isFinalSlice) continue;
 
-		const transcribeOne = async (
-			ignoreTrims: boolean,
-			forceFullSequences: boolean,
-			timestampMode: "word" | "phrase",
-		): Promise<CaptionSegment[]> => {
-			try {
-				const activeTrims = ignoreTrims ? [] : trims;
-				if (samples.length <= TRANSCRIBE_SLICE_SAMPLES) {
-					const { slice, realDurationSec } = padTailSliceForTranscribe(samples);
-					const result = await runTranscriberOnSlice(transcriber, slice, {
-						forceFullSequences,
-						timestampMode,
-					});
-					return segmentsFromTranscriberChunks(
+				const { slice, realDurationSec } =
+					sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && isFinalSlice
+						? padTailSliceForTranscribe(sliceRaw)
+						: { slice: sliceRaw, realDurationSec: sliceRaw.length / 16_000 };
+
+				const result = await runTranscriberOnSlice(transcriber, slice, {
+					forceFullSequences,
+					timestampMode,
+				});
+				if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+				const tOff = offset / 16_000;
+				all.push(
+					...segmentsFromTranscriberChunks(
 						extractChunksFromAsrResult(result),
-						0,
+						tOff,
 						activeTrims,
 						realDurationSec,
-					);
-				}
-
-				const all: CaptionSegment[] = [];
-				for (let offset = 0; offset < samples.length; offset += TRANSCRIBE_SLICE_SAMPLES) {
-					if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-					const end = Math.min(offset + TRANSCRIBE_SLICE_SAMPLES, samples.length);
-					const sliceRaw = samples.subarray(offset, end);
-					const isFinalSlice = end >= samples.length;
-					if (sliceRaw.length === 0) continue;
-					if (sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && !isFinalSlice) continue;
-
-					const { slice, realDurationSec } =
-						sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && isFinalSlice
-							? padTailSliceForTranscribe(sliceRaw)
-							: { slice: sliceRaw, realDurationSec: sliceRaw.length / 16_000 };
-
-					const result = await runTranscriberOnSlice(transcriber, slice, {
-						forceFullSequences,
-						timestampMode,
-					});
-					const tOff = offset / 16_000;
-					all.push(
-						...segmentsFromTranscriberChunks(
-							extractChunksFromAsrResult(result),
-							tOff,
-							activeTrims,
-							realDurationSec,
-						),
-					);
-				}
-				return all;
-			} catch (e) {
-				if (e instanceof DOMException && e.name === "AbortError") throw e;
-				console.warn("[captioning] Whisper pass failed:", e);
-				return [];
+					),
+				);
 			}
-		};
+			return all;
+		} catch (e) {
+			if (e instanceof DOMException && e.name === "AbortError") throw e;
+			console.warn("[captioning] Whisper pass failed:", e);
+			return [];
+		}
+	};
 
-		const attemptModes: Array<"word" | "phrase"> = ["word", "phrase"];
-		for (const timestampMode of attemptModes) {
-			let segments = await transcribeOne(false, true, timestampMode);
+	const attemptModes: Array<"word" | "phrase"> = ["word", "phrase"];
+	for (const timestampMode of attemptModes) {
+		let segments = await transcribeOne(false, true, timestampMode);
+		if (segments.length === 0) {
+			segments = await transcribeOne(false, false, timestampMode);
+		}
+		if (segments.length === 0 && trims.length > 0) {
+			segments = dropSegmentsOverlappingTrimRegions(
+				await transcribeOne(true, true, timestampMode),
+				trims,
+			);
 			if (segments.length === 0) {
-				segments = await transcribeOne(false, false, timestampMode);
-			}
-			if (segments.length === 0 && trims.length > 0) {
-				segments = await transcribeOne(true, true, timestampMode);
-				if (segments.length === 0) {
-					segments = await transcribeOne(true, false, timestampMode);
-				}
-			}
-			if (segments.length > 0) {
-				return { segments, granularity: timestampMode };
+				segments = dropSegmentsOverlappingTrimRegions(
+					await transcribeOne(true, false, timestampMode),
+					trims,
+				);
 			}
 		}
+		if (segments.length > 0) {
+			return { segments, granularity: timestampMode };
+		}
+	}
 
-		return { segments: [], granularity: "phrase" };
-	});
+	return { segments: [], granularity: "phrase" };
 }

From e5462f1ac81e2c361c7385953c531c4d96661308 Mon Sep 17 00:00:00 2001
From: Siddharth <siddharthvaddem@yahoo.com>
Date: Mon, 1 Jun 2026 20:42:41 -0700
Subject: [PATCH 7/7] captions: run Whisper in a Web Worker, move auto-captions
 button to timeline

Move model load and transcription off the renderer main thread into a
dedicated worker (transcribe.worker.ts + transcribeCore.ts) so the editor
UI no longer freezes during captioning. Relocate the auto-captions action
from the editor header into the timeline toolbar, and add the missing
it/pt-BR/ru/vi autoCaptions translations.
---
 package-lock.json                             |   1 -
 src/components/video-editor/VideoEditor.tsx   |  36 +-
 .../video-editor/timeline/TimelineEditor.tsx  |  21 +
 src/i18n/locales/it/editor.json               |  19 +
 src/i18n/locales/pt-BR/editor.json            |  19 +
 src/i18n/locales/ru/editor.json               |  19 +
 src/i18n/locales/vi/editor.json               |  19 +
 src/lib/captioning/transcribe.ts              | 360 +++---------------
 src/lib/captioning/transcribe.worker.ts       |  81 ++++
 src/lib/captioning/transcribeCore.ts          | 269 +++++++++++++
 vite.config.ts                                |   5 +
 11 files changed, 523 insertions(+), 326 deletions(-)
 create mode 100644 src/lib/captioning/transcribe.worker.ts
 create mode 100644 src/lib/captioning/transcribeCore.ts

diff --git a/package-lock.json b/package-lock.json
index 4862a12a5..d4abbfbbe 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -7,7 +7,6 @@
 		"": {
 			"name": "openscreen",
 			"version": "1.4.0",
-			"hasInstallScript": true,
 			"dependencies": {
 				"@fix-webm-duration/fix": "^1.0.1",
 				"@pixi/filter-drop-shadow": "^5.2.0",
diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx
index ec46d8582..dda653274 100644
--- a/src/components/video-editor/VideoEditor.tsx
+++ b/src/components/video-editor/VideoEditor.tsx
@@ -1,7 +1,6 @@
 import type { Span } from "dnd-timeline";
 import { FolderOpen, Languages, Save, Video } from "lucide-react";
 import { type CSSProperties, useCallback, useEffect, useMemo, useRef, useState } from "react";
-import { MdClosedCaption } from "react-icons/md";
 import { Panel, PanelGroup, PanelResizeHandle } from "react-resizable-panels";
 import { toast } from "sonner";
 import { Button } from "@/components/ui/button";
@@ -2404,28 +2403,6 @@ export default function VideoEditor() {
 						<Save size={14} />
 						{ts("project.save")}
 					</button>
-					<Button
-						type="button"
-						variant="ghost"
-						size="sm"
-						disabled={isAutoCaptioning || !videoPath}
-						onClick={() => {
-							if (!videoPath) {
-								toast.error(t("errors.noVideoLoaded"));
-								return;
-							}
-							if (isAutoCaptioningRef.current) {
-								toast.error(t("autoCaptions.busy"));
-								return;
-							}
-							setShowAutoCaptionsDialog(true);
-						}}
-						className="h-7 px-2 text-white/50 hover:text-white/90 hover:bg-white/10 text-[11px] font-medium gap-1"
-						style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
-					>
-						<MdClosedCaption className="size-3.5 shrink-0" aria-hidden />
-						{t("autoCaptions.button")}
-					</Button>
 				</div>
 			</div>
 
@@ -2781,6 +2758,19 @@ export default function VideoEditor() {
 									}
 									videoUrl={videoPath ?? undefined}
 									showTrimWaveform={showTrimWaveform}
+									captionsLabel={t("autoCaptions.button")}
+									isGeneratingCaptions={isAutoCaptioning}
+									onGenerateCaptions={() => {
+										if (!videoPath) {
+											toast.error(t("errors.noVideoLoaded"));
+											return;
+										}
+										if (isAutoCaptioningRef.current) {
+											toast.error(t("autoCaptions.busy"));
+											return;
+										}
+										setShowAutoCaptionsDialog(true);
+									}}
 								/>
 							</div>
 						</Panel>
diff --git a/src/components/video-editor/timeline/TimelineEditor.tsx b/src/components/video-editor/timeline/TimelineEditor.tsx
index f84d038a9..65ebd8bdb 100644
--- a/src/components/video-editor/timeline/TimelineEditor.tsx
+++ b/src/components/video-editor/timeline/TimelineEditor.tsx
@@ -1,6 +1,7 @@
 import type { Range, Span } from "dnd-timeline";
 import { useTimelineContext } from "dnd-timeline";
 import {
+	Captions,
 	Check,
 	ChevronDown,
 	Gauge,
@@ -92,6 +93,11 @@ interface TimelineEditorProps {
 	onAspectRatioChange: (aspectRatio: AspectRatio) => void;
 	videoUrl?: string;
 	showTrimWaveform?: boolean;
+	/** Opens the auto-captions flow. When omitted, the captions button is hidden. */
+	onGenerateCaptions?: () => void;
+	isGeneratingCaptions?: boolean;
+	/** Localized label for the auto-captions button (lives in the `editor` namespace). */
+	captionsLabel?: string;
 }
 
 interface TimelineScaleConfig {
@@ -924,6 +930,9 @@ export default function TimelineEditor({
 	onAspectRatioChange,
 	videoUrl,
 	showTrimWaveform = false,
+	onGenerateCaptions,
+	isGeneratingCaptions = false,
+	captionsLabel,
 }: TimelineEditorProps) {
 	const t = useScopedT("timeline");
 	const totalMs = useMemo(() => Math.max(0, Math.round(videoDuration * 1000)), [videoDuration]);
@@ -1659,6 +1668,18 @@ export default function TimelineEditor({
 					>
 						<Gauge className="w-4 h-4" />
 					</Button>
+					{onGenerateCaptions && (
+						<Button
+							onClick={onGenerateCaptions}
+							disabled={isGeneratingCaptions || !videoUrl}
+							variant="ghost"
+							size="icon"
+							className="h-7 w-7 rounded-lg text-slate-400 hover:text-[#a78bfa] hover:bg-[#a78bfa]/10 transition-all"
+							title={captionsLabel}
+						>
+							<Captions className="w-4 h-4" />
+						</Button>
+					)}
 				</div>
 				<div className="flex items-center gap-1.5 min-w-0">
 					<DropdownMenu>
diff --git a/src/i18n/locales/it/editor.json b/src/i18n/locales/it/editor.json
index 336d3e6ba..0e94b9a9f 100644
--- a/src/i18n/locales/it/editor.json
+++ b/src/i18n/locales/it/editor.json
@@ -42,5 +42,24 @@
 		"cameraNotFound": "Fotocamera non trovata.",
 		"permissionDenied": "Autorizzazione di registrazione negata. Consenti la registrazione dello schermo.",
 		"accessibilityAllowAndRetry": "Consenti l'accesso all'accessibilità per OpenScreen, poi premi di nuovo registra per avviare il conto alla rovescia."
+	},
+	"autoCaptions": {
+		"button": "Sottotitoli automatici",
+		"dialogTitle": "Sottotitoli automatici",
+		"dialogDescription": "Scegli all'incirca quante parole mostrare per ogni sottotitolo. La temporizzazione viene distribuita tra le parole della frase.",
+		"minWords": "Numero minimo di parole per sottotitolo",
+		"maxWords": "Numero massimo di parole per sottotitolo",
+		"wordsCount": "{{count}} parole",
+		"generate": "Genera",
+		"dialogCancel": "Annulla",
+		"generating": "Generazione dei sottotitoli dall'audio…",
+		"loadingModel": "Caricamento del modello vocale (al primo utilizzo vengono scaricati ~75 MB)…",
+		"transcribing": "Trascrizione del parlato…",
+		"busy": "La generazione dei sottotitoli è già in corso.",
+		"done": "Aggiunti {{count}} sottotitoli.",
+		"noneHeard": "Nessun parlato rilevato.",
+		"noAudio": "Questo video non contiene audio utilizzabile per la trascrizione.",
+		"failed": "Impossibile generare i sottotitoli.",
+		"truncated": "Sono stati trascritti solo i primi {{minutes}} minuti."
 	}
 }
diff --git a/src/i18n/locales/pt-BR/editor.json b/src/i18n/locales/pt-BR/editor.json
index 7e3f69531..b0e9ab8c9 100644
--- a/src/i18n/locales/pt-BR/editor.json
+++ b/src/i18n/locales/pt-BR/editor.json
@@ -41,5 +41,24 @@
 		"cameraDisconnected": "Webcam desconectada.",
 		"cameraNotFound": "Câmera não encontrada.",
 		"permissionDenied": "Permissão de gravação negada. Por favor, permita a gravação de tela."
+	},
+	"autoCaptions": {
+		"button": "Legendas automáticas",
+		"dialogTitle": "Legendas automáticas",
+		"dialogDescription": "Escolha aproximadamente quantas palavras cada legenda mostra de cada vez. O tempo é distribuído entre as palavras da frase.",
+		"minWords": "Mínimo de palavras por legenda",
+		"maxWords": "Máximo de palavras por legenda",
+		"wordsCount": "{{count}} palavras",
+		"generate": "Gerar",
+		"dialogCancel": "Cancelar",
+		"generating": "Gerando legendas a partir do áudio…",
+		"loadingModel": "Carregando o modelo de fala (o primeiro uso baixa ~75 MB)…",
+		"transcribing": "Transcrevendo a fala…",
+		"busy": "A geração de legendas já está em andamento.",
+		"done": "{{count}} legendas adicionadas.",
+		"noneHeard": "Nenhuma fala foi detectada.",
+		"noAudio": "Este vídeo não tem áudio utilizável para transcrição.",
+		"failed": "Não foi possível gerar as legendas.",
+		"truncated": "Apenas os primeiros {{minutes}} minutos foram transcritos."
 	}
 }
diff --git a/src/i18n/locales/ru/editor.json b/src/i18n/locales/ru/editor.json
index ff0c80b8b..78fa129a1 100644
--- a/src/i18n/locales/ru/editor.json
+++ b/src/i18n/locales/ru/editor.json
@@ -44,6 +44,25 @@
 		"permissionDenied": "Разрешение на запись запрещено. Пожалуйста, разрешите запись экрана.",
 		"accessibilityAllowAndRetry": "Разрешите OpenScreen доступ к Универсальному доступу, затем снова нажмите запись, чтобы начать обратный отсчет."
 	},
+	"autoCaptions": {
+		"button": "Автосубтитры",
+		"dialogTitle": "Автосубтитры",
+		"dialogDescription": "Выберите, сколько примерно слов показывать в одном субтитре. Время распределяется между словами фразы.",
+		"minWords": "Минимум слов в субтитре",
+		"maxWords": "Максимум слов в субтитре",
+		"wordsCount": "{{count}} слов",
+		"generate": "Создать",
+		"dialogCancel": "Отмена",
+		"generating": "Создание субтитров из звука…",
+		"loadingModel": "Загрузка речевой модели (при первом запуске скачивается ~75 МБ)…",
+		"transcribing": "Распознавание речи…",
+		"busy": "Создание субтитров уже выполняется.",
+		"done": "Добавлено субтитров: {{count}}.",
+		"noneHeard": "Речь не обнаружена.",
+		"noAudio": "В этом видео нет звука, пригодного для расшифровки.",
+		"failed": "Не удалось создать субтитры.",
+		"truncated": "Расшифрованы только первые {{minutes}} мин."
+	},
 	"emptyState": {
 		"title": "Нет открытых проектов",
 		"description": "Импортируйте видео для начала редактирования или загрузите существующий проект OpenScreen.",
diff --git a/src/i18n/locales/vi/editor.json b/src/i18n/locales/vi/editor.json
index 1875bb559..90004091e 100644
--- a/src/i18n/locales/vi/editor.json
+++ b/src/i18n/locales/vi/editor.json
@@ -44,6 +44,25 @@
 		"permissionDenied": "Quyền ghi hình bị từ chối. Vui lòng cho phép ghi màn hình.",
 		"accessibilityAllowAndRetry": "Cho phép OpenScreen truy cập Trợ năng, sau đó nhấn ghi lại để bắt đầu đếm ngược."
 	},
+	"autoCaptions": {
+		"button": "Phụ đề tự động",
+		"dialogTitle": "Phụ đề tự động",
+		"dialogDescription": "Chọn khoảng bao nhiêu từ mỗi phụ đề hiển thị cùng lúc. Thời gian được phân bổ cho các từ trong cụm từ đó.",
+		"minWords": "Số từ tối thiểu mỗi phụ đề",
+		"maxWords": "Số từ tối đa mỗi phụ đề",
+		"wordsCount": "{{count}} từ",
+		"generate": "Tạo",
+		"dialogCancel": "Hủy",
+		"generating": "Đang tạo phụ đề từ âm thanh…",
+		"loadingModel": "Đang tải mô hình giọng nói (lần đầu sử dụng sẽ tải ~75 MB)…",
+		"transcribing": "Đang chuyển lời nói thành văn bản…",
+		"busy": "Việc tạo phụ đề đang được tiến hành.",
+		"done": "Đã thêm {{count}} phụ đề.",
+		"noneHeard": "Không phát hiện thấy lời nói.",
+		"noAudio": "Video này không có âm thanh dùng được để chuyển thành văn bản.",
+		"failed": "Không thể tạo phụ đề.",
+		"truncated": "Chỉ {{minutes}} phút đầu tiên được chuyển thành văn bản."
+	},
 	"emptyState": {
 		"title": "Không có dự án nào được mở",
 		"description": "Nhập video để bắt đầu chỉnh sửa hoặc tải một dự án OpenScreen hiện có.",
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
index e85f15157..91f1d91f0 100644
--- a/src/lib/captioning/transcribe.ts
+++ b/src/lib/captioning/transcribe.ts
@@ -14,209 +14,27 @@ export interface TranscribeMono16kResult {
 	granularity: CaptionTimestampGranularity;
 }
 
-function segmentOverlapsTrim(startMs: number, endMs: number, trims: TrimRegion[]): boolean {
-	return trims.some((t) => startMs < t.endMs && endMs > t.startMs);
+/** Request payload posted from the renderer to the transcription worker. */
+export interface TranscribeWorkerRequest {
+	samples: Float32Array;
+	trimRegions: TrimRegion[];
 }
 
-/** Same trim-out rule as {@link segmentsFromTranscriberChunks}; for retry passes that used empty trims. */
-function dropSegmentsOverlappingTrimRegions(
-	segments: CaptionSegment[],
-	trimRegions: TrimRegion[],
-): CaptionSegment[] {
-	if (trimRegions.length === 0) return segments;
-	return segments.filter((s) => {
-		const startMs = Math.round(s.startSec * 1000);
-		const endMs = Math.round(s.endSec * 1000);
-		return !segmentOverlapsTrim(startMs, endMs, trimRegions);
-	});
-}
-
-/** Lets the browser paint toast / in-app status before Whisper blocks the main thread (WASM may not yield). */
-async function yieldForUiPaint(): Promise<void> {
-	await new Promise<void>((resolve) => setTimeout(resolve, 0));
-	await new Promise<void>((resolve) => {
-		requestAnimationFrame(() => {
-			requestAnimationFrame(() => resolve());
-		});
-	});
-	// macrotask after rAF so React/Sonner state can commit under load.
-	await new Promise<void>((resolve) => setTimeout(resolve, 0));
-}
+/** Messages the transcription worker posts back to the renderer. */
+export type TranscribeWorkerResponse =
+	| { type: "status"; phase: "model" | "transcribe" }
+	| { type: "result"; segments: CaptionSegment[]; granularity: CaptionTimestampGranularity }
+	| { type: "error"; message: string };
 
 /**
- * ONNX Runtime's wasm bundle treats `process.versions.node` (present in Electron's
- * renderer) as Node and tries `require("fs")`, which Vite does not support. Mask it
- * only while Transformers / ORT run.
+ * Transcribes mono 16 kHz audio into timed caption segments using in-browser Whisper.
+ *
+ * The model load and inference run inside a dedicated Web Worker so the editor's
+ * main thread stays responsive (WASM inference does not yield). The first run
+ * downloads model weights. Aborting (via `options.signal`) terminates the worker
+ * immediately, since model load / inference cannot be cooperatively cancelled.
  */
-function withoutNodeVersion<T>(fn: () => Promise<T>): Promise<T> {
-	const versions =
-		typeof process !== "undefined" && process.versions && typeof process.versions === "object"
-			? process.versions
-			: null;
-	const hadNode = versions !== null && "node" in versions;
-	const savedNode = hadNode ? (versions as { node?: string }).node : undefined;
-	if (hadNode && versions) {
-		try {
-			Reflect.deleteProperty(versions, "node");
-		} catch {
-			(versions as { node?: string }).node = undefined;
-		}
-	}
-	return fn().finally(() => {
-		if (hadNode && versions && savedNode !== undefined) {
-			(versions as { node: string }).node = savedNode;
-		}
-	});
-}
-
-/** Whisper runs with internal 30s chunks; keep each forward pass bounded for WASM memory. */
-const TRANSCRIBE_SLICE_SAMPLES = 12 * 60 * 16_000;
-
-/** Very short slices are skipped in the multi-slice loop unless padded (see `padTailSliceForTranscribe`). */
-const MIN_TRANSCRIBE_SLICE_SAMPLES = 800;
-
-/**
- * Pad a short tail slice so Whisper still runs; timestamps are clamped with `realDurationSec` so
- * padding does not extend perceived audio on the timeline.
- */
-function padTailSliceForTranscribe(samples: Float32Array): {
-	slice: Float32Array;
-	realDurationSec: number;
-} {
-	const realDurationSec = samples.length / 16_000;
-	if (samples.length >= MIN_TRANSCRIBE_SLICE_SAMPLES) {
-		return { slice: samples, realDurationSec };
-	}
-	const padded = new Float32Array(MIN_TRANSCRIBE_SLICE_SAMPLES);
-	padded.set(samples);
-	return { slice: padded, realDurationSec };
-}
-
-function segmentsFromTranscriberChunks(
-	chunks: Array<{ timestamp?: [number | null, number | null]; text?: unknown }>,
-	timeOffsetSec: number,
-	trims: TrimRegion[],
-	audioDurationSec: number,
-): CaptionSegment[] {
-	const sorted = [...chunks].sort((x, y) => {
-		const ax = x.timestamp?.[0];
-		const ay = y.timestamp?.[0];
-		const na = typeof ax === "number" ? ax : -1;
-		const nb = typeof ay === "number" ? ay : -1;
-		return na - nb;
-	});
-
-	const segments: CaptionSegment[] = [];
-
-	for (let idx = 0; idx < sorted.length; idx++) {
-		const c = sorted[idx]!;
-		const ts = c.timestamp as [number | null, number | null] | undefined;
-		if (!ts) continue;
-		let a = ts[0];
-		let b = ts[1];
-		if (a == null) a = 0;
-		a = Math.max(0, a);
-		if (b == null) {
-			let nextStart: number | null = null;
-			for (let j = idx + 1; j < sorted.length; j++) {
-				const na = sorted[j]?.timestamp?.[0];
-				if (typeof na === "number") {
-					nextStart = na;
-					break;
-				}
-			}
-			b = nextStart ?? audioDurationSec;
-		}
-		if (b <= a) {
-			b = Math.min(a + 0.25, audioDurationSec);
-		}
-		b = Math.min(b, audioDurationSec);
-
-		const text = String(c.text ?? "")
-			.replace(/\s+/g, " ")
-			.trim();
-		if (!text) continue;
-
-		const startSec = a + timeOffsetSec;
-		const sliceEnd = timeOffsetSec + audioDurationSec;
-		const endSec = Math.min(Math.max(startSec + 0.08, b + timeOffsetSec), sliceEnd);
-		const startMs = Math.round(startSec * 1000);
-		const endMs = Math.round(endSec * 1000);
-		if (segmentOverlapsTrim(startMs, endMs, trims)) continue;
-
-		segments.push({ startSec, endSec, text });
-	}
-
-	segments.sort((u, v) => u.startSec - v.startSec || u.endSec - v.endSec);
-	const rawDeduped: CaptionSegment[] = [];
-	for (const seg of segments) {
-		const prev = rawDeduped[rawDeduped.length - 1];
-		if (prev && prev.text === seg.text && seg.startSec <= prev.endSec) {
-			prev.endSec = Math.max(prev.endSec, seg.endSec);
-			prev.startSec = Math.min(prev.startSec, seg.startSec);
-			continue;
-		}
-		rawDeduped.push(seg);
-	}
-	return rawDeduped;
-}
-
-async function runTranscriberOnSlice(
-	transcriber: (audio: Float32Array, opts: Record<string, unknown>) => Promise<unknown>,
-	samples: Float32Array,
-	opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase" },
-): Promise<unknown> {
-	const durationSec = samples.length / 16_000;
-	// Only chunk long clips; short-audio chunking regressed some Whisper.js runs (empty chunks).
-	const chunking = durationSec > 30 ? { chunk_length_s: 30, stride_length_s: 5 } : {};
-	return transcriber(samples, {
-		return_timestamps: opts.timestampMode === "word" ? "word" : true,
-		force_full_sequences: opts.forceFullSequences,
-		...chunking,
-	});
-}
-
-function getChunksFromTranscriberResult(result: unknown): Array<{
-	timestamp?: [number | null, number | null];
-	text?: unknown;
-}> {
-	if (result == null) return [];
-	if (Array.isArray(result)) {
-		const out: Array<{ timestamp?: [number | null, number | null]; text?: unknown }> = [];
-		for (const item of result) {
-			const chunks = (item as { chunks?: unknown })?.chunks;
-			if (Array.isArray(chunks)) out.push(...chunks);
-		}
-		return out;
-	}
-	const chunks = (result as { chunks?: unknown })?.chunks;
-	return Array.isArray(chunks) ? chunks : [];
-}
-
-/** Prefer `chunks`; if the model only returned top-level `text`, synthesize one span for timing. */
-function extractChunksFromAsrResult(result: unknown): Array<{
-	timestamp?: [number | null, number | null];
-	text?: unknown;
-}> {
-	const fromChunks = getChunksFromTranscriberResult(result);
-	if (fromChunks.length > 0) return fromChunks;
-	const single = Array.isArray(result) ? result[0] : result;
-	const text =
-		typeof (single as { text?: unknown })?.text === "string"
-			? String((single as { text: string }).text).trim()
-			: "";
-	if (text) {
-		return [{ timestamp: [0, null], text }];
-	}
-	return [];
-}
-
-/**
- * Runs Whisper in-browser via Transformers.js. First run downloads model weights.
- * Long audio is split into slices so one forward pass does not exhaust WASM memory;
- * timestamps are shifted to the full timeline.
- */
-export async function transcribeMono16kToSegments(
+export function transcribeMono16kToSegments(
 	samples: Float32Array,
 	options?: {
 		trimRegions?: TrimRegion[];
@@ -224,112 +42,50 @@ export async function transcribeMono16kToSegments(
 		signal?: AbortSignal;
 	},
 ): Promise<TranscribeMono16kResult> {
-	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-
-	const { transcriber } = await withoutNodeVersion(async () => {
-		const { pipeline, env } = await import("@xenova/transformers");
-		env.allowLocalModels = false;
-
-		if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-
-		options?.onStatus?.("model");
-		// Default tiny weights only: the `output_attentions` revision has regressed inference for
-		// some environments (empty chunks / thrown errors) while phrase mode works on this model.
-		const t = await pipeline("automatic-speech-recognition", "Xenova/whisper-tiny");
-		return { transcriber: t };
-	});
-
-	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-
-	await yieldForUiPaint();
-
-	const trims = options?.trimRegions ?? [];
-	options?.onStatus?.("transcribe");
-	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-	await yieldForUiPaint();
-	if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-
-	const transcribeOne = async (
-		ignoreTrims: boolean,
-		forceFullSequences: boolean,
-		timestampMode: "word" | "phrase",
-	): Promise<CaptionSegment[]> => {
-		try {
-			const activeTrims = ignoreTrims ? [] : trims;
-			if (samples.length <= TRANSCRIBE_SLICE_SAMPLES) {
-				const { slice, realDurationSec } = padTailSliceForTranscribe(samples);
-				const result = await runTranscriberOnSlice(transcriber, slice, {
-					forceFullSequences,
-					timestampMode,
-				});
-				if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-				return segmentsFromTranscriberChunks(
-					extractChunksFromAsrResult(result),
-					0,
-					activeTrims,
-					realDurationSec,
-				);
-			}
-
-			const all: CaptionSegment[] = [];
-			for (let offset = 0; offset < samples.length; offset += TRANSCRIBE_SLICE_SAMPLES) {
-				if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-				const end = Math.min(offset + TRANSCRIBE_SLICE_SAMPLES, samples.length);
-				const sliceRaw = samples.subarray(offset, end);
-				const isFinalSlice = end >= samples.length;
-				if (sliceRaw.length === 0) continue;
-				if (sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && !isFinalSlice) continue;
+	if (options?.signal?.aborted) {
+		return Promise.reject(new DOMException("Aborted", "AbortError"));
+	}
 
-				const { slice, realDurationSec } =
-					sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && isFinalSlice
-						? padTailSliceForTranscribe(sliceRaw)
-						: { slice: sliceRaw, realDurationSec: sliceRaw.length / 16_000 };
+	return new Promise<TranscribeMono16kResult>((resolve, reject) => {
+		const worker = new Worker(new URL("./transcribe.worker.ts", import.meta.url), {
+			type: "module",
+		});
 
-				const result = await runTranscriberOnSlice(transcriber, slice, {
-					forceFullSequences,
-					timestampMode,
-				});
-				if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
-				const tOff = offset / 16_000;
-				all.push(
-					...segmentsFromTranscriberChunks(
-						extractChunksFromAsrResult(result),
-						tOff,
-						activeTrims,
-						realDurationSec,
-					),
-				);
+		let settled = false;
+		const finish = (fn: () => void) => {
+			if (settled) return;
+			settled = true;
+			options?.signal?.removeEventListener("abort", onAbort);
+			worker.terminate();
+			fn();
+		};
+
+		const onAbort = () => finish(() => reject(new DOMException("Aborted", "AbortError")));
+		options?.signal?.addEventListener("abort", onAbort, { once: true });
+
+		worker.onmessage = (e: MessageEvent<TranscribeWorkerResponse>) => {
+			const msg = e.data;
+			if (msg.type === "status") {
+				options?.onStatus?.(msg.phase);
+				return;
 			}
-			return all;
-		} catch (e) {
-			if (e instanceof DOMException && e.name === "AbortError") throw e;
-			console.warn("[captioning] Whisper pass failed:", e);
-			return [];
-		}
-	};
-
-	const attemptModes: Array<"word" | "phrase"> = ["word", "phrase"];
-	for (const timestampMode of attemptModes) {
-		let segments = await transcribeOne(false, true, timestampMode);
-		if (segments.length === 0) {
-			segments = await transcribeOne(false, false, timestampMode);
-		}
-		if (segments.length === 0 && trims.length > 0) {
-			segments = dropSegmentsOverlappingTrimRegions(
-				await transcribeOne(true, true, timestampMode),
-				trims,
-			);
-			if (segments.length === 0) {
-				segments = dropSegmentsOverlappingTrimRegions(
-					await transcribeOne(true, false, timestampMode),
-					trims,
-				);
+			if (msg.type === "result") {
+				finish(() => resolve({ segments: msg.segments, granularity: msg.granularity }));
+				return;
 			}
-		}
-		if (segments.length > 0) {
-			return { segments, granularity: timestampMode };
-		}
-	}
-
-	return { segments: [], granularity: "phrase" };
+			finish(() => reject(new Error(msg.message)));
+		};
+
+		worker.onerror = (e) => {
+			finish(() => reject(new Error(e.message || "Caption transcription worker failed")));
+		};
+
+		// Structured-clone copy (not a transfer): the caller may reuse `samples`
+		// for the full-buffer retry pass, so the buffer must stay valid here.
+		const request: TranscribeWorkerRequest = {
+			samples,
+			trimRegions: options?.trimRegions ?? [],
+		};
+		worker.postMessage(request);
+	});
 }
diff --git a/src/lib/captioning/transcribe.worker.ts b/src/lib/captioning/transcribe.worker.ts
new file mode 100644
index 000000000..edd16e8ec
--- /dev/null
+++ b/src/lib/captioning/transcribe.worker.ts
@@ -0,0 +1,81 @@
+/**
+ * Web Worker: runs in-browser Whisper transcription off the renderer's main
+ * thread so the editor UI never blocks while the model loads or audio is
+ * transcribed.
+ *
+ * Input message:  { samples: Float32Array; trimRegions: TrimRegion[] }
+ * Output messages (see `TranscribeWorkerResponse`):
+ *   { type: "status", phase: "model" | "transcribe" }  progress updates
+ *   { type: "result", segments, granularity }          final captions
+ *   { type: "error", message }                          failure detail
+ *
+ * The caller terminates this worker to abort (model load / inference cannot be
+ * cooperatively cancelled), so there is no in-worker abort handling.
+ */
+
+import type { TranscribeWorkerRequest, TranscribeWorkerResponse } from "./transcribe";
+import { runTranscription, type TranscriberFn } from "./transcribeCore";
+
+function post(message: TranscribeWorkerResponse): void { // typed wrapper: every outgoing message is a TranscribeWorkerResponse
+	(self as unknown as Worker).postMessage(message); // cast gives `self` the Worker-style postMessage signature
+}
+
+/**
+ * Runs `fn` with `process.versions.node` hidden and restores it once `fn` settles. ONNX
+ * Runtime's wasm bundle treats that property (which can leak into an Electron worker) as
+ * Node and tries `require("fs")`, which Vite does not support. No-op when `process` is
+ * undefined (the usual case in a Web Worker).
+ */
+function withoutNodeVersion<T>(fn: () => Promise<T>): Promise<T> {
+	const versions =
+		typeof process !== "undefined" && process.versions && typeof process.versions === "object"
+			? process.versions
+			: null;
+	const hadNode = versions !== null && "node" in versions;
+	const savedNode = hadNode ? (versions as { node?: string }).node : undefined;
+	if (hadNode && versions) {
+		try {
+			if (!Reflect.deleteProperty(versions, "node")) (versions as { node?: string }).node = undefined;
+		} catch {
+			// deleteProperty returns false (no throw) on failure; a throw here means the property is read-only too.
+		}
+	}
+	return fn().finally(() => {
+		if (hadNode && versions && savedNode !== undefined) {
+			(versions as { node: string }).node = savedNode;
+		}
+	});
+}
+
+/** Builds the Whisper-tiny speech-recognition pipeline while `process.versions.node` is masked. */
+async function loadTranscriber(): Promise<TranscriberFn> {
+	const buildPipeline = async (): Promise<TranscriberFn> => {
+		const { pipeline, env } = await import("@xenova/transformers");
+		env.allowLocalModels = false;
+		// Stay on the default tiny weights: the `output_attentions` revision has regressed inference
+		// in some environments (empty chunks / thrown errors), while phrase mode works on this model.
+		const task = "automatic-speech-recognition";
+		const asrPipeline = await pipeline(task, "Xenova/whisper-tiny");
+		return asrPipeline as unknown as TranscriberFn;
+	};
+	return withoutNodeVersion(buildPipeline);
+}
+
+// Handles a transcription request: announce each phase, then post the captions or the failure text.
+self.onmessage = async (event: MessageEvent<TranscribeWorkerRequest>) => {
+	const { samples, trimRegions } = event.data;
+	try {
+		// Phase 1: load the model.
+		post({ type: "status", phase: "model" });
+		const transcriber = await loadTranscriber();
+		// Phase 2: run inference over the samples.
+		post({ type: "status", phase: "transcribe" });
+		const regions = trimRegions ?? [];
+		const outcome = await runTranscription(transcriber, samples, regions);
+		post({ type: "result", segments: outcome.segments, granularity: outcome.granularity });
+	} catch (failure) {
+		// Only the message text is forwarded; non-Error throwables are stringified.
+		const message = failure instanceof Error ? failure.message : String(failure);
+		post({ type: "error", message });
+	}
+};
diff --git a/src/lib/captioning/transcribeCore.ts b/src/lib/captioning/transcribeCore.ts
new file mode 100644
index 000000000..111995246
--- /dev/null
+++ b/src/lib/captioning/transcribeCore.ts
@@ -0,0 +1,269 @@
+import type { TrimRegion } from "@/components/video-editor/types";
+import type { CaptionSegment, TranscribeMono16kResult } from "./transcribe";
+
+/**
+ * Pure transcription algorithm shared by the captioning Web Worker. It takes an
+ * already-constructed Whisper `transcriber` and turns mono 16 kHz audio into
+ * timed caption segments. Kept free of DOM / Transformers.js imports so it can
+ * run inside a worker and be unit-tested in isolation.
+ */
+
+/** A Transformers.js automatic-speech-recognition pipeline call: samples and options in, untyped result out. */
+export type TranscriberFn = (
+	audio: Float32Array, // callers in this module pass (slices of) the mono 16 kHz sample buffer
+	opts: Record<string, unknown>, // pipeline options such as return_timestamps / chunk_length_s
+) => Promise<unknown>;
+
+function segmentOverlapsTrim(startMs: number, endMs: number, trims: TrimRegion[]): boolean {
+	return trims.findIndex((region) => region.startMs < endMs && region.endMs > startMs) !== -1;
+}
+
+/** Removes segments whose rounded-millisecond span intersects any trim region (for retry passes run without trims). */
+function dropSegmentsOverlappingTrimRegions(
+	segments: CaptionSegment[],
+	trimRegions: TrimRegion[],
+): CaptionSegment[] {
+	// Nothing to filter against: hand back the same array instance untouched.
+	if (trimRegions.length === 0) return segments;
+	const toMs = (seconds: number): number => Math.round(seconds * 1000);
+	return segments.filter(
+		(segment) => !segmentOverlapsTrim(toMs(segment.startSec), toMs(segment.endSec), trimRegions),
+	);
+}
+
+/** Whisper runs with internal 30s chunks; keep each forward pass bounded for WASM memory. */
+const TRANSCRIBE_SLICE_SAMPLES = 12 * 60 * 16_000; // 12 minutes of audio at 16,000 samples per second
+
+/** Very short slices are skipped in the multi-slice loop unless padded (see `padTailSliceForTranscribe`). */
+const MIN_TRANSCRIBE_SLICE_SAMPLES = 800; // 50 ms of audio at 16,000 samples per second
+
+/**
+ * Zero-pads audio shorter than `MIN_TRANSCRIBE_SLICE_SAMPLES` up to that length and reports the
+ * unpadded duration, which callers use to clamp timestamps. Long-enough input is returned as-is.
+ */
+function padTailSliceForTranscribe(samples: Float32Array): {
+	slice: Float32Array;
+	realDurationSec: number;
+} {
+	const needsPadding = samples.length < MIN_TRANSCRIBE_SLICE_SAMPLES;
+	let slice = samples;
+	if (needsPadding) {
+		slice = new Float32Array(MIN_TRANSCRIBE_SLICE_SAMPLES); // zero-filled by construction
+		slice.set(samples);
+	}
+	return { slice, realDurationSec: samples.length / 16_000 };
+}
+
+/** Converts raw Whisper chunk output into sorted, deduped, trim-filtered caption segments on the full timeline. */
+function segmentsFromTranscriberChunks(
+	chunks: Array<{ timestamp?: [number | null, number | null]; text?: unknown }>,
+	timeOffsetSec: number, // where this slice starts on the full timeline; added to every timestamp
+	trims: TrimRegion[], // segments overlapping any of these (compared in ms) are dropped
+	audioDurationSec: number, // real (unpadded) slice length; no timestamp may exceed it
+): CaptionSegment[] {
+	const sorted = [...chunks].sort((x, y) => {
+		const ax = x.timestamp?.[0];
+		const ay = y.timestamp?.[0];
+		const na = typeof ax === "number" ? ax : -1; // chunks without a numeric start sort first
+		const nb = typeof ay === "number" ? ay : -1;
+		return na - nb;
+	});
+
+	const segments: CaptionSegment[] = [];
+
+	for (let idx = 0; idx < sorted.length; idx++) {
+		const c = sorted[idx]!;
+		const ts = c.timestamp as [number | null, number | null] | undefined;
+		if (!ts) continue;
+		let a = ts[0];
+		let b = ts[1];
+		if (a == null) a = 0;
+		a = Math.min(Math.max(0, a), audioDurationSec); // a start past the real audio would yield endSec < startSec below
+		if (b == null) { // open-ended chunk: end at the next chunk's start, else at the end of the audio
+			let nextStart: number | null = null;
+			for (let j = idx + 1; j < sorted.length; j++) {
+				const na = sorted[j]?.timestamp?.[0];
+				if (typeof na === "number") {
+					nextStart = na;
+					break;
+				}
+			}
+			b = nextStart ?? audioDurationSec;
+		}
+		if (b <= a) { // empty or inverted span: give it a nominal 0.25 s
+			b = Math.min(a + 0.25, audioDurationSec);
+		}
+		b = Math.min(b, audioDurationSec);
+
+		const text = String(c.text ?? "")
+			.replace(/\s+/g, " ")
+			.trim();
+		if (!text) continue;
+
+		const startSec = a + timeOffsetSec;
+		const sliceEnd = timeOffsetSec + audioDurationSec;
+		const endSec = Math.min(Math.max(startSec + 0.08, b + timeOffsetSec), sliceEnd); // >= 80 ms long, never past the slice end
+		const startMs = Math.round(startSec * 1000);
+		const endMs = Math.round(endSec * 1000);
+		if (segmentOverlapsTrim(startMs, endMs, trims)) continue;
+
+		segments.push({ startSec, endSec, text });
+	}
+
+	segments.sort((u, v) => u.startSec - v.startSec || u.endSec - v.endSec);
+	const rawDeduped: CaptionSegment[] = [];
+	for (const seg of segments) {
+		const prev = rawDeduped[rawDeduped.length - 1];
+		if (prev && prev.text === seg.text && seg.startSec <= prev.endSec) { // same text, touching/overlapping: merge
+			prev.endSec = Math.max(prev.endSec, seg.endSec);
+			prev.startSec = Math.min(prev.startSec, seg.startSec);
+			continue;
+		}
+		rawDeduped.push(seg);
+	}
+	return rawDeduped;
+}
+
+/** Invokes the transcriber on one slice; clips over 30 s additionally get 30 s chunks with a 5 s stride. */
+async function runTranscriberOnSlice(
+	transcriber: TranscriberFn,
+	samples: Float32Array,
+	opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase" },
+): Promise<unknown> {
+	const callOptions: Record<string, unknown> = {
+		return_timestamps: opts.timestampMode === "word" ? "word" : true,
+		force_full_sequences: opts.forceFullSequences,
+	};
+	// Only chunk long clips; short-audio chunking regressed some Whisper.js runs (empty chunks).
+	const isLongClip = samples.length / 16_000 > 30;
+	if (isLongClip) Object.assign(callOptions, { chunk_length_s: 30, stride_length_s: 5 });
+	return transcriber(samples, callOptions);
+}
+
+/** Collects the `chunks` arrays from an ASR result that is either one output object or an array of them. */
+function getChunksFromTranscriberResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	type RawChunk = { timestamp?: [number | null, number | null]; text?: unknown };
+	// Reads `.chunks` off an arbitrary value, yielding undefined unless it is an array.
+	const chunksOf = (value: unknown): RawChunk[] | undefined => {
+		const candidate = (value as { chunks?: unknown })?.chunks;
+		return Array.isArray(candidate) ? candidate : undefined;
+	};
+	if (result == null) return [];
+	if (!Array.isArray(result)) return chunksOf(result) ?? [];
+	const collected: RawChunk[] = [];
+	for (const entry of result) collected.push(...(chunksOf(entry) ?? []));
+	return collected;
+}
+
+/** Returns the result's chunks, or one synthetic `[0, null]` chunk built from top-level `text` when none exist. */
+function extractChunksFromAsrResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	const timedChunks = getChunksFromTranscriberResult(result);
+	if (timedChunks.length > 0) return timedChunks;
+
+	// Fall back to the first output's plain transcript, with an open-ended (null) end time.
+	const firstOutput = (Array.isArray(result) ? result[0] : result) as { text?: unknown } | null | undefined;
+	const rawText = firstOutput?.text;
+	if (typeof rawText !== "string") return [];
+	const text = rawText.trim();
+	// Whitespace-only transcripts produce no chunk at all.
+	if (text === "") return [];
+	return [{ timestamp: [0, null], text }];
+}
+
+/**
+ * Drives Whisper over mono 16 kHz audio (sliced when longer than `TRANSCRIBE_SLICE_SAMPLES`)
+ * and returns timed segments shifted onto the full timeline. Tries word- then phrase-level
+ * timestamps, each with a `force_full_sequences`-off retry and a trim-ignoring retry. A pass
+ * that throws is logged and counted as empty, so model errors yield `segments: []`.
+ */
+export async function runTranscription(
+	transcriber: TranscriberFn,
+	samples: Float32Array,
+	trims: TrimRegion[],
+): Promise<TranscribeMono16kResult> {
+	const transcribeOne = async ( // one full pass over the audio; failures become []
+		ignoreTrims: boolean,
+		forceFullSequences: boolean,
+		timestampMode: "word" | "phrase",
+	): Promise<CaptionSegment[]> => {
+		try {
+			const activeTrims = ignoreTrims ? [] : trims;
+			if (samples.length <= TRANSCRIBE_SLICE_SAMPLES) { // fits in a single transcriber call
+				const { slice, realDurationSec } = padTailSliceForTranscribe(samples);
+				const result = await runTranscriberOnSlice(transcriber, slice, {
+					forceFullSequences,
+					timestampMode,
+				});
+				return segmentsFromTranscriberChunks(
+					extractChunksFromAsrResult(result),
+					0,
+					activeTrims,
+					realDurationSec,
+				);
+			}
+
+			const all: CaptionSegment[] = [];
+			for (let offset = 0; offset < samples.length; offset += TRANSCRIBE_SLICE_SAMPLES) { // long audio: slice by slice
+				const end = Math.min(offset + TRANSCRIBE_SLICE_SAMPLES, samples.length);
+				const sliceRaw = samples.subarray(offset, end); // a view onto `samples`, not a copy
+				const isFinalSlice = end >= samples.length;
+				if (sliceRaw.length === 0) continue;
+				if (sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && !isFinalSlice) continue;
+
+				const { slice, realDurationSec } =
+					sliceRaw.length < MIN_TRANSCRIBE_SLICE_SAMPLES && isFinalSlice
+						? padTailSliceForTranscribe(sliceRaw)
+						: { slice: sliceRaw, realDurationSec: sliceRaw.length / 16_000 };
+
+				const result = await runTranscriberOnSlice(transcriber, slice, {
+					forceFullSequences,
+					timestampMode,
+				});
+				const tOff = offset / 16_000; // slice start in seconds on the full timeline
+				all.push(
+					...segmentsFromTranscriberChunks(
+						extractChunksFromAsrResult(result),
+						tOff,
+						activeTrims,
+						realDurationSec,
+					),
+				);
+			}
+			return all;
+		} catch (e) { // a failed pass counts as "no segments" so the next fallback can still run
+			console.warn("[captioning] Whisper pass failed:", e);
+			return [];
+		}
+	};
+
+	const attemptModes: Array<"word" | "phrase"> = ["word", "phrase"];
+	for (const timestampMode of attemptModes) {
+		let segments = await transcribeOne(false, true, timestampMode); // first with force_full_sequences on
+		if (segments.length === 0) {
+			segments = await transcribeOne(false, false, timestampMode); // then with it off
+		}
+		if (segments.length === 0 && trims.length > 0) { // still empty: transcribe without trims, filter afterwards
+			segments = dropSegmentsOverlappingTrimRegions(
+				await transcribeOne(true, true, timestampMode),
+				trims,
+			);
+			if (segments.length === 0) {
+				segments = dropSegmentsOverlappingTrimRegions(
+					await transcribeOne(true, false, timestampMode),
+					trims,
+				);
+			}
+		}
+		if (segments.length > 0) {
+			return { segments, granularity: timestampMode };
+		}
+	}
+
+	return { segments: [], granularity: "phrase" }; // every attempt in both modes came back empty
+}
diff --git a/vite.config.ts b/vite.config.ts
index 914b307e5..213e44711 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -39,6 +39,11 @@ export default defineConfig({
 	optimizeDeps: {
 		exclude: ["@xenova/transformers"],
 	},
+	// The captioning worker dynamically imports @xenova/transformers, which makes the
+	// worker bundle code-split — unsupported by the default "iife" worker format.
+	worker: {
+		format: "es",
+	},
 	build: {
 		target: "esnext",
 		minify: "terser",