diff --git a/package-lock.json b/package-lock.json
index 50ecc9d88..d4abbfbbe 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -26,6 +26,7 @@
 				"@uiw/color-convert": "^2.10.1",
 				"@uiw/react-color-block": "^2.10.1",
 				"@uiw/react-color-colorful": "^2.9.2",
+				"@xenova/transformers": "^2.17.2",
 				"class-variance-authority": "^0.7.1",
 				"clsx": "^2.1.1",
 				"dnd-timeline": "^2.4.0",
@@ -1772,6 +1773,15 @@
 			"integrity": "sha512-RiB/yIh78pcIxl6lLMG0CgBXAZ2Y0eVHqMPYugu+9U0AeT6YBeiJpf7lbdJNIugFP5SIjwNRgo4DhR1Qxi26Gg==",
 			"license": "MIT"
 		},
+		"node_modules/@huggingface/jinja": {
+			"version": "0.2.2",
+			"resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz",
+			"integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==",
+			"license": "MIT",
+			"engines": {
+				"node": ">=18"
+			}
+		},
 		"node_modules/@isaacs/fs-minipass": {
 			"version": "4.0.1",
 			"resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz",
@@ -2104,6 +2114,70 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/@protobufjs/aspromise": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
+			"integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/base64": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
+			"integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/codegen": {
+			"version": "2.0.5",
+			"resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.5.tgz",
+			"integrity": "sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/eventemitter": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
+			"integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/fetch": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
+			"integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
+			"license": "BSD-3-Clause",
+			"dependencies": {
+				"@protobufjs/aspromise": "^1.1.1",
+				"@protobufjs/inquire": "^1.1.0"
+			}
+		},
+		"node_modules/@protobufjs/float": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
+			"integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/inquire": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.1.tgz",
+			"integrity": "sha512-mnzgDV26ueAvk7rsbt9L7bE0SuAoqyuys/sMMrmVcN5x9VsxpcG3rqAUSgDyLp0UZlmNfIbQ4fHfCtreVBk8Ew==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/path": {
+			"version": "1.1.2",
+			"resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
+			"integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/pool": {
+			"version": "1.1.0",
+			"resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
+			"integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==",
+			"license": "BSD-3-Clause"
+		},
+		"node_modules/@protobufjs/utf8": {
+			"version": "1.1.1",
+			"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.1.tgz",
+			"integrity": "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==",
+			"license": "BSD-3-Clause"
+		},
 		"node_modules/@radix-ui/number": {
 			"version": "1.1.1",
 			"resolved": "https://registry.npmjs.org/@radix-ui/number/-/number-1.1.1.tgz",
@@ -3822,6 +3896,12 @@
 				"@types/node": "*"
 			}
 		},
+		"node_modules/@types/long": {
+			"version": "4.0.2",
+			"resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",
+			"integrity": "sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==",
+			"license": "MIT"
+		},
 		"node_modules/@types/ms": {
 			"version": "2.1.0",
 			"resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz",
@@ -3833,7 +3913,6 @@
 			"version": "22.19.17",
 			"resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.17.tgz",
 			"integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"undici-types": "~6.21.0"
@@ -4293,6 +4372,20 @@
 			"integrity": "sha512-RPmm6kgRbI8e98zSD3RVACvnuktIja5+yLgDAkTmxLr90BEwdTXRQWNLF3ETTTyH/8mKhznZuN5AveXYFEsMGQ==",
 			"license": "BSD-3-Clause"
 		},
+		"node_modules/@xenova/transformers": {
+			"version": "2.17.2",
+			"resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.2.tgz",
+			"integrity": "sha512-lZmHqzrVIkSvZdKZEx7IYY51TK0WDrC8eR0c5IMnBsO8di8are1zzw8BlLhyO2TklZKLN5UffNGs1IJwT6oOqQ==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"@huggingface/jinja": "^0.2.2",
+				"onnxruntime-web": "1.14.0",
+				"sharp": "^0.32.0"
+			},
+			"optionalDependencies": {
+				"onnxruntime-node": "1.14.0"
+			}
+		},
 		"node_modules/@xmldom/xmldom": {
 			"version": "0.8.13",
 			"resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.13.tgz",
@@ -4763,11 +4856,101 @@
 				"node": "18 || 20 || >=22"
 			}
 		},
+		"node_modules/bare-events": {
+			"version": "2.8.2",
+			"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
+			"integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
+			"license": "Apache-2.0",
+			"peerDependencies": {
+				"bare-abort-controller": "*"
+			},
+			"peerDependenciesMeta": {
+				"bare-abort-controller": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/bare-fs": {
+			"version": "4.7.1",
+			"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.7.1.tgz",
+			"integrity": "sha512-WDRsyVN52eAx/lBamKD6uyw8H4228h/x0sGGGegOamM2cd7Pag88GfMQalobXI+HaEUxpCkbKQUDOQqt9wawRw==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-events": "^2.5.4",
+				"bare-path": "^3.0.0",
+				"bare-stream": "^2.6.4",
+				"bare-url": "^2.2.2",
+				"fast-fifo": "^1.3.2"
+			},
+			"engines": {
+				"bare": ">=1.16.0"
+			},
+			"peerDependencies": {
+				"bare-buffer": "*"
+			},
+			"peerDependenciesMeta": {
+				"bare-buffer": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/bare-os": {
+			"version": "3.9.1",
+			"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.9.1.tgz",
+			"integrity": "sha512-6M5XjcnsygQNPMCMPXSK379xrJFiZ/AEMNBmFEmQW8d/789VQATvriyi5r0HYTL9TkQ26rn3kgdTG3aisbrXkQ==",
+			"license": "Apache-2.0",
+			"engines": {
+				"bare": ">=1.14.0"
+			}
+		},
+		"node_modules/bare-path": {
+			"version": "3.0.0",
+			"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
+			"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-os": "^3.0.1"
+			}
+		},
+		"node_modules/bare-stream": {
+			"version": "2.13.1",
+			"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.13.1.tgz",
+			"integrity": "sha512-Vp0cnjYyrEC4whYTymQ+YZi6pBpfiICZO3cfRG8sy67ZNWe951urv1x4eW1BKNngw3U+3fPYb5JQvHbCtxH7Ow==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"streamx": "^2.25.0",
+				"teex": "^1.0.1"
+			},
+			"peerDependencies": {
+				"bare-abort-controller": "*",
+				"bare-buffer": "*",
+				"bare-events": "*"
+			},
+			"peerDependenciesMeta": {
+				"bare-abort-controller": {
+					"optional": true
+				},
+				"bare-buffer": {
+					"optional": true
+				},
+				"bare-events": {
+					"optional": true
+				}
+			}
+		},
+		"node_modules/bare-url": {
+			"version": "2.4.3",
+			"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.4.3.tgz",
+			"integrity": "sha512-Kccpc7ACfXaxfeInfqKcZtW4pT5YBn1mesc4sCsun6sRwtbJ4h+sNOaksUpYEJUKfN65YWC6Bw2OJEFiKxq8nQ==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-path": "^3.0.0"
+			}
+		},
 		"node_modules/base64-js": {
 			"version": "1.5.1",
 			"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
 			"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@@ -4819,6 +5002,17 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/bl": {
+			"version": "4.1.0",
+			"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
+			"integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==",
+			"license": "MIT",
+			"dependencies": {
+				"buffer": "^5.5.0",
+				"inherits": "^2.0.4",
+				"readable-stream": "^3.4.0"
+			}
+		},
 		"node_modules/boolean": {
 			"version": "3.2.0",
 			"resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz",
@@ -4891,7 +5085,6 @@
 			"version": "5.7.1",
 			"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
 			"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@@ -4907,7 +5100,6 @@
 				}
 			],
 			"license": "MIT",
-			"optional": true,
 			"dependencies": {
 				"base64-js": "^1.3.1",
 				"ieee754": "^1.1.13"
@@ -5306,11 +5498,23 @@
 				"node": ">=6"
 			}
 		},
+		"node_modules/color": {
+			"version": "4.2.3",
+			"resolved": "https://registry.npmjs.org/color/-/color-4.2.3.tgz",
+			"integrity": "sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==",
+			"license": "MIT",
+			"dependencies": {
+				"color-convert": "^2.0.1",
+				"color-string": "^1.9.0"
+			},
+			"engines": {
+				"node": ">=12.5.0"
+			}
+		},
 		"node_modules/color-convert": {
 			"version": "2.0.1",
 			"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
 			"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"color-name": "~1.1.4"
@@ -5323,9 +5527,18 @@
 			"version": "1.1.4",
 			"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
 			"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
-			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/color-string": {
+			"version": "1.9.1",
+			"resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz",
+			"integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==",
+			"license": "MIT",
+			"dependencies": {
+				"color-name": "^1.0.0",
+				"simple-swizzle": "^0.2.2"
+			}
+		},
 		"node_modules/colorette": {
 			"version": "2.0.20",
 			"resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz",
@@ -5529,7 +5742,6 @@
 			"version": "6.0.0",
 			"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
 			"integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"mimic-response": "^3.1.0"
@@ -5545,7 +5757,6 @@
 			"version": "3.1.0",
 			"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
 			"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
-			"dev": true,
 			"license": "MIT",
 			"engines": {
 				"node": ">=10"
@@ -5554,6 +5765,15 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/deep-extend": {
+			"version": "0.6.0",
+			"resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz",
+			"integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==",
+			"license": "MIT",
+			"engines": {
+				"node": ">=4.0.0"
+			}
+		},
 		"node_modules/defer-to-connect": {
 			"version": "2.0.1",
 			"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-2.0.1.tgz",
@@ -5622,6 +5842,15 @@
 				"node": ">=6"
 			}
 		},
+		"node_modules/detect-libc": {
+			"version": "2.1.2",
+			"resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz",
+			"integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==",
+			"license": "Apache-2.0",
+			"engines": {
+				"node": ">=8"
+			}
+		},
 		"node_modules/detect-node": {
 			"version": "2.1.0",
 			"resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz",
@@ -6096,7 +6325,6 @@
 			"version": "1.4.5",
 			"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
 			"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"once": "^1.4.0"
@@ -6289,6 +6517,24 @@
 			"license": "MIT",
 			"peer": true
 		},
+		"node_modules/events-universal": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
+			"integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"bare-events": "^2.7.0"
+			}
+		},
+		"node_modules/expand-template": {
+			"version": "2.0.3",
+			"resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz",
+			"integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==",
+			"license": "(MIT OR WTFPL)",
+			"engines": {
+				"node": ">=6"
+			}
+		},
 		"node_modules/expect-type": {
 			"version": "1.3.0",
 			"resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz",
@@ -6368,6 +6614,12 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/fast-fifo": {
+			"version": "1.3.2",
+			"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
+			"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
+			"license": "MIT"
+		},
 		"node_modules/fast-glob": {
 			"version": "3.3.3",
 			"resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz",
@@ -6503,6 +6755,12 @@
 			"integrity": "sha512-IKlE+pNvL2R+kVL1kEhUYqRxVqeFnjiIvHWDMLFXNaqyUdFXQM2wte44EfMYJNHkW16X991t2Zg8apKkhv7OBA==",
 			"license": "MIT"
 		},
+		"node_modules/flatbuffers": {
+			"version": "1.12.0",
+			"resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz",
+			"integrity": "sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==",
+			"license": "SEE LICENSE IN LICENSE.txt"
+		},
 		"node_modules/form-data": {
 			"version": "4.0.5",
 			"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz",
@@ -6561,6 +6819,12 @@
 				}
 			}
 		},
+		"node_modules/fs-constants": {
+			"version": "1.0.0",
+			"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
+			"integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==",
+			"license": "MIT"
+		},
 		"node_modules/fs-extra": {
 			"version": "8.1.0",
 			"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz",
@@ -6716,6 +6980,12 @@
 				"js-binary-schema-parser": "^2.0.3"
 			}
 		},
+		"node_modules/github-from-package": {
+			"version": "0.0.0",
+			"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
+			"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
+			"license": "MIT"
+		},
 		"node_modules/glob": {
 			"version": "7.2.3",
 			"resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
@@ -6883,6 +7153,12 @@
 			"integrity": "sha512-dMW4CWBTUK1AEEDeZc1g4xpPGIrSf9fJF960qbTZmN/QwZIWY5wgliS6JWl9/25fpTGJrMRtSjGtOmPnfjZB+A==",
 			"license": "Standard 'no charge' license: https://gsap.com/standard-license."
 		},
+		"node_modules/guid-typescript": {
+			"version": "1.0.9",
+			"resolved": "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz",
+			"integrity": "sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==",
+			"license": "ISC"
+		},
 		"node_modules/has-flag": {
 			"version": "4.0.0",
 			"resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
@@ -7093,7 +7369,6 @@
 			"version": "1.2.1",
 			"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
 			"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@@ -7108,8 +7383,7 @@
 					"url": "https://feross.org/support"
 				}
 			],
-			"license": "BSD-3-Clause",
-			"optional": true
+			"license": "BSD-3-Clause"
 		},
 		"node_modules/indent-string": {
 			"version": "4.0.0",
@@ -7137,9 +7411,20 @@
 			"version": "2.0.4",
 			"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
 			"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
-			"dev": true,
 			"license": "ISC"
 		},
+		"node_modules/ini": {
+			"version": "1.3.8",
+			"resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
+			"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==",
+			"license": "ISC"
+		},
+		"node_modules/is-arrayish": {
+			"version": "0.3.4",
+			"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz",
+			"integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==",
+			"license": "MIT"
+		},
 		"node_modules/is-binary-path": {
 			"version": "2.1.0",
 			"resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz",
@@ -7652,6 +7937,12 @@
 				"url": "https://github.com/chalk/slice-ansi?sponsor=1"
 			}
 		},
+		"node_modules/long": {
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz",
+			"integrity": "sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==",
+			"license": "Apache-2.0"
+		},
 		"node_modules/loose-envify": {
 			"version": "1.4.0",
 			"resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
@@ -7884,7 +8175,6 @@
 			"version": "1.2.8",
 			"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
 			"integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
-			"dev": true,
 			"license": "MIT",
 			"funding": {
 				"url": "https://github.com/sponsors/ljharb"
@@ -7927,6 +8217,12 @@
 				"mkdirp": "bin/cmd.js"
 			}
 		},
+		"node_modules/mkdirp-classic": {
+			"version": "0.5.3",
+			"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
+			"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==",
+			"license": "MIT"
+		},
 		"node_modules/motion": {
 			"version": "12.38.0",
 			"resolved": "https://registry.npmjs.org/motion/-/motion-12.38.0.tgz",
@@ -8023,6 +8319,12 @@
 				"node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
 			}
 		},
+		"node_modules/napi-build-utils": {
+			"version": "2.0.0",
+			"resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz",
+			"integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==",
+			"license": "MIT"
+		},
 		"node_modules/node-abi": {
 			"version": "4.28.0",
 			"resolved": "https://registry.npmjs.org/node-abi/-/node-abi-4.28.0.tgz",
@@ -8256,7 +8558,6 @@
 			"version": "1.4.0",
 			"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
 			"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
-			"dev": true,
 			"license": "ISC",
 			"dependencies": {
 				"wrappy": "1"
@@ -8278,6 +8579,50 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/onnx-proto": {
+			"version": "4.0.4",
+			"resolved": "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz",
+			"integrity": "sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==",
+			"license": "MIT",
+			"dependencies": {
+				"protobufjs": "^6.8.8"
+			}
+		},
+		"node_modules/onnxruntime-common": {
+			"version": "1.14.0",
+			"resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz",
+			"integrity": "sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==",
+			"license": "MIT"
+		},
+		"node_modules/onnxruntime-node": {
+			"version": "1.14.0",
+			"resolved": "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz",
+			"integrity": "sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==",
+			"license": "MIT",
+			"optional": true,
+			"os": [
+				"win32",
+				"darwin",
+				"linux"
+			],
+			"dependencies": {
+				"onnxruntime-common": "~1.14.0"
+			}
+		},
+		"node_modules/onnxruntime-web": {
+			"version": "1.14.0",
+			"resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz",
+			"integrity": "sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==",
+			"license": "MIT",
+			"dependencies": {
+				"flatbuffers": "^1.12.0",
+				"guid-typescript": "^1.0.9",
+				"long": "^4.0.0",
+				"onnx-proto": "^4.0.4",
+				"onnxruntime-common": "~1.14.0",
+				"platform": "^1.3.6"
+			}
+		},
 		"node_modules/p-cancelable": {
 			"version": "2.1.1",
 			"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
@@ -8470,6 +8815,12 @@
 			"integrity": "sha512-mlsTRyGaPBjPedk6Bvw+aqbsXDtoAyAzm5MO7JgU+yVRyMQ5O8bD4Kcci7BS85f93veegeCPkL8R4GLClnjLFw==",
 			"license": "MIT"
 		},
+		"node_modules/platform": {
+			"version": "1.3.6",
+			"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
+			"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
+			"license": "MIT"
+		},
 		"node_modules/playwright": {
 			"version": "1.59.1",
 			"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
@@ -8713,6 +9064,91 @@
 				"node": "^12.20.0 || >=14"
 			}
 		},
+		"node_modules/prebuild-install": {
+			"version": "7.1.3",
+			"resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz",
+			"integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==",
+			"deprecated": "No longer maintained. Please contact the author of the relevant native addon; alternatives are available.",
+			"license": "MIT",
+			"dependencies": {
+				"detect-libc": "^2.0.0",
+				"expand-template": "^2.0.3",
+				"github-from-package": "0.0.0",
+				"minimist": "^1.2.3",
+				"mkdirp-classic": "^0.5.3",
+				"napi-build-utils": "^2.0.0",
+				"node-abi": "^3.3.0",
+				"pump": "^3.0.0",
+				"rc": "^1.2.7",
+				"simple-get": "^4.0.0",
+				"tar-fs": "^2.0.0",
+				"tunnel-agent": "^0.6.0"
+			},
+			"bin": {
+				"prebuild-install": "bin.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/chownr": {
+			"version": "1.1.4",
+			"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
+			"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==",
+			"license": "ISC"
+		},
+		"node_modules/prebuild-install/node_modules/node-abi": {
+			"version": "3.92.0",
+			"resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.92.0.tgz",
+			"integrity": "sha512-KdHvFWZjEKDf0cakgFjebl371GPsISX2oZHcuyKqM7DtogIsHrqKeLTo8wBHxaXRAQlY2PsPlZmfo+9ZCxEREQ==",
+			"license": "MIT",
+			"dependencies": {
+				"semver": "^7.3.5"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/semver": {
+			"version": "7.8.0",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.8.0.tgz",
+			"integrity": "sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==",
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/tar-fs": {
+			"version": "2.1.4",
+			"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz",
+			"integrity": "sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==",
+			"license": "MIT",
+			"dependencies": {
+				"chownr": "^1.1.1",
+				"mkdirp-classic": "^0.5.2",
+				"pump": "^3.0.0",
+				"tar-stream": "^2.1.4"
+			}
+		},
+		"node_modules/prebuild-install/node_modules/tar-stream": {
+			"version": "2.2.0",
+			"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz",
+			"integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==",
+			"license": "MIT",
+			"dependencies": {
+				"bl": "^4.0.3",
+				"end-of-stream": "^1.4.1",
+				"fs-constants": "^1.0.0",
+				"inherits": "^2.0.3",
+				"readable-stream": "^3.1.1"
+			},
+			"engines": {
+				"node": ">=6"
+			}
+		},
 		"node_modules/pretty-format": {
 			"version": "27.5.1",
 			"resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-27.5.1.tgz",
@@ -8806,11 +9242,36 @@
 				"signal-exit": "^3.0.2"
 			}
 		},
+		"node_modules/protobufjs": {
+			"version": "6.11.6",
+			"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.6.tgz",
+			"integrity": "sha512-k8BHqgPBOtrlougZZqF2uUk5Z7bN8f0wj+3e8M3hvtSv0NBAz4VBy5f6R5Nxq/l+i7mRFTgNZb2trxqTpHNY/A==",
+			"hasInstallScript": true,
+			"license": "BSD-3-Clause",
+			"dependencies": {
+				"@protobufjs/aspromise": "^1.1.2",
+				"@protobufjs/base64": "^1.1.2",
+				"@protobufjs/codegen": "^2.0.4",
+				"@protobufjs/eventemitter": "^1.1.0",
+				"@protobufjs/fetch": "^1.1.0",
+				"@protobufjs/float": "^1.0.2",
+				"@protobufjs/inquire": "^1.1.0",
+				"@protobufjs/path": "^1.1.2",
+				"@protobufjs/pool": "^1.1.0",
+				"@protobufjs/utf8": "^1.1.0",
+				"@types/long": "^4.0.1",
+				"@types/node": ">=13.7.0",
+				"long": "^4.0.0"
+			},
+			"bin": {
+				"pbjs": "bin/pbjs",
+				"pbts": "bin/pbts"
+			}
+		},
 		"node_modules/pump": {
 			"version": "3.0.4",
 			"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.4.tgz",
 			"integrity": "sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA==",
-			"dev": true,
 			"license": "MIT",
 			"dependencies": {
 				"end-of-stream": "^1.1.0",
@@ -8893,6 +9354,21 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/rc": {
+			"version": "1.2.8",
+			"resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz",
+			"integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==",
+			"license": "(BSD-2-Clause OR MIT OR Apache-2.0)",
+			"dependencies": {
+				"deep-extend": "^0.6.0",
+				"ini": "~1.3.0",
+				"minimist": "^1.2.0",
+				"strip-json-comments": "~2.0.1"
+			},
+			"bin": {
+				"rc": "cli.js"
+			}
+		},
 		"node_modules/re-resizable": {
 			"version": "6.11.2",
 			"resolved": "https://registry.npmjs.org/re-resizable/-/re-resizable-6.11.2.tgz",
@@ -9091,6 +9567,20 @@
 				"pify": "^2.3.0"
 			}
 		},
+		"node_modules/readable-stream": {
+			"version": "3.6.2",
+			"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
+			"integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
+			"license": "MIT",
+			"dependencies": {
+				"inherits": "^2.0.3",
+				"string_decoder": "^1.1.1",
+				"util-deprecate": "^1.0.1"
+			},
+			"engines": {
+				"node": ">= 6"
+			}
+		},
 		"node_modules/readdirp": {
 			"version": "3.6.0",
 			"resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz",
@@ -9373,6 +9863,26 @@
 				"queue-microtask": "^1.2.2"
 			}
 		},
+		"node_modules/safe-buffer": {
+			"version": "5.2.1",
+			"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
+			"integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT"
+		},
 		"node_modules/safer-buffer": {
 			"version": "2.1.2",
 			"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
@@ -9457,6 +9967,47 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/sharp": {
+			"version": "0.32.6",
+			"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz",
+			"integrity": "sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==",
+			"hasInstallScript": true,
+			"license": "Apache-2.0",
+			"dependencies": {
+				"color": "^4.2.3",
+				"detect-libc": "^2.0.2",
+				"node-addon-api": "^6.1.0",
+				"prebuild-install": "^7.1.1",
+				"semver": "^7.5.4",
+				"simple-get": "^4.0.1",
+				"tar-fs": "^3.0.4",
+				"tunnel-agent": "^0.6.0"
+			},
+			"engines": {
+				"node": ">=14.15.0"
+			},
+			"funding": {
+				"url": "https://opencollective.com/libvips"
+			}
+		},
+		"node_modules/sharp/node_modules/node-addon-api": {
+			"version": "6.1.0",
+			"resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
+			"integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==",
+			"license": "MIT"
+		},
+		"node_modules/sharp/node_modules/semver": {
+			"version": "7.8.0",
+			"resolved": "https://registry.npmjs.org/semver/-/semver-7.8.0.tgz",
+			"integrity": "sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==",
+			"license": "ISC",
+			"bin": {
+				"semver": "bin/semver.js"
+			},
+			"engines": {
+				"node": ">=10"
+			}
+		},
 		"node_modules/shebang-command": {
 			"version": "2.0.0",
 			"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
@@ -9570,6 +10121,60 @@
 			"dev": true,
 			"license": "ISC"
 		},
+		"node_modules/simple-concat": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz",
+			"integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==",
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT"
+		},
+		"node_modules/simple-get": {
+			"version": "4.0.1",
+			"resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz",
+			"integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==",
+			"funding": [
+				{
+					"type": "github",
+					"url": "https://github.com/sponsors/feross"
+				},
+				{
+					"type": "patreon",
+					"url": "https://www.patreon.com/feross"
+				},
+				{
+					"type": "consulting",
+					"url": "https://feross.org/support"
+				}
+			],
+			"license": "MIT",
+			"dependencies": {
+				"decompress-response": "^6.0.0",
+				"once": "^1.3.1",
+				"simple-concat": "^1.0.0"
+			}
+		},
+		"node_modules/simple-swizzle": {
+			"version": "0.2.4",
+			"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz",
+			"integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==",
+			"license": "MIT",
+			"dependencies": {
+				"is-arrayish": "^0.3.1"
+			}
+		},
 		"node_modules/simple-update-notifier": {
 			"version": "2.0.0",
 			"resolved": "https://registry.npmjs.org/simple-update-notifier/-/simple-update-notifier-2.0.0.tgz",
@@ -9711,6 +10316,26 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/streamx": {
+			"version": "2.25.0",
+			"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.25.0.tgz",
+			"integrity": "sha512-0nQuG6jf1w+wddNEEXCF4nTg3LtufWINB5eFEN+5TNZW7KWJp6x87+JFL43vaAUPyCfH1wID+mNVyW6OHtFamg==",
+			"license": "MIT",
+			"dependencies": {
+				"events-universal": "^1.0.0",
+				"fast-fifo": "^1.3.2",
+				"text-decoder": "^1.1.0"
+			}
+		},
+		"node_modules/string_decoder": {
+			"version": "1.3.0",
+			"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
+			"integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
+			"license": "MIT",
+			"dependencies": {
+				"safe-buffer": "~5.2.0"
+			}
+		},
 		"node_modules/string-argv": {
 			"version": "0.3.2",
 			"resolved": "https://registry.npmjs.org/string-argv/-/string-argv-0.3.2.tgz",
@@ -9791,6 +10416,15 @@
 				"node": ">=8"
 			}
 		},
+		"node_modules/strip-json-comments": {
+			"version": "2.0.1",
+			"resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz",
+			"integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==",
+			"license": "MIT",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
 		"node_modules/sucrase": {
 			"version": "3.35.1",
 			"resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.1.tgz",
@@ -9949,6 +10583,46 @@
 				"node": ">=18"
 			}
 		},
+		"node_modules/tar-fs": {
+			"version": "3.1.2",
+			"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.2.tgz",
+			"integrity": "sha512-QGxxTxxyleAdyM3kpFs14ymbYmNFrfY+pHj7Z8FgtbZ7w2//VAgLMac7sT6nRpIHjppXO2AwwEOg0bPFVRcmXw==",
+			"license": "MIT",
+			"dependencies": {
+				"pump": "^3.0.0",
+				"tar-stream": "^3.1.5"
+			},
+			"optionalDependencies": {
+				"bare-fs": "^4.0.1",
+				"bare-path": "^3.0.0"
+			}
+		},
+		"node_modules/tar-stream": {
+			"version": "3.2.0",
+			"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.2.0.tgz",
+			"integrity": "sha512-ojzvCvVaNp6aOTFmG7jaRD0meowIAuPc3cMMhSgKiVWws1GyHbGd/xvnyuRKcKlMpt3qvxx6r0hreCNITP9hIg==",
+			"license": "MIT",
+			"dependencies": {
+				"b4a": "^1.6.4",
+				"bare-fs": "^4.5.5",
+				"fast-fifo": "^1.2.0",
+				"streamx": "^2.15.0"
+			}
+		},
+		"node_modules/tar-stream/node_modules/b4a": {
+			"version": "1.8.1",
+			"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.8.1.tgz",
+			"integrity": "sha512-aiqre1Nr0B/6DgE2N5vwTc+2/oQZ4Wh1t4NznYY4E00y8LCt6NqdRv81so00oo27D8MVKTpUa/MwUUtBLXCoDw==",
+			"license": "Apache-2.0",
+			"peerDependencies": {
+				"react-native-b4a": "*"
+			},
+			"peerDependenciesMeta": {
+				"react-native-b4a": {
+					"optional": true
+				}
+			}
+		},
 		"node_modules/tar/node_modules/yallist": {
 			"version": "5.0.0",
 			"resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz",
@@ -9959,6 +10633,15 @@
 				"node": ">=18"
 			}
 		},
+		"node_modules/teex": {
+			"version": "1.0.1",
+			"resolved": "https://registry.npmjs.org/teex/-/teex-1.0.1.tgz",
+			"integrity": "sha512-eYE6iEI62Ni1H8oIa7KlDU6uQBtqr4Eajni3wX7rpfXD8ysFx8z0+dri+KWEPWpBsxXfxu58x/0jvTVT1ekOSg==",
+			"license": "MIT",
+			"dependencies": {
+				"streamx": "^2.12.5"
+			}
+		},
 		"node_modules/temp": {
 			"version": "0.9.4",
 			"resolved": "https://registry.npmjs.org/temp/-/temp-0.9.4.tgz",
@@ -10049,6 +10732,29 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/text-decoder": {
+			"version": "1.2.7",
+			"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.7.tgz",
+			"integrity": "sha512-vlLytXkeP4xvEq2otHeJfSQIRyWxo/oZGEbXrtEEF9Hnmrdly59sUbzZ/QgyWuLYHctCHxFF4tRQZNQ9k60ExQ==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"b4a": "^1.6.4"
+			}
+		},
+		"node_modules/text-decoder/node_modules/b4a": {
+			"version": "1.8.1",
+			"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.8.1.tgz",
+			"integrity": "sha512-aiqre1Nr0B/6DgE2N5vwTc+2/oQZ4Wh1t4NznYY4E00y8LCt6NqdRv81so00oo27D8MVKTpUa/MwUUtBLXCoDw==",
+			"license": "Apache-2.0",
+			"peerDependencies": {
+				"react-native-b4a": "*"
+			},
+			"peerDependenciesMeta": {
+				"react-native-b4a": {
+					"optional": true
+				}
+			}
+		},
 		"node_modules/thenify": {
 			"version": "3.3.1",
 			"resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz",
@@ -10252,6 +10958,18 @@
 			"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
 			"license": "0BSD"
 		},
+		"node_modules/tunnel-agent": {
+			"version": "0.6.0",
+			"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
+			"integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
+			"license": "Apache-2.0",
+			"dependencies": {
+				"safe-buffer": "^5.0.1"
+			},
+			"engines": {
+				"node": "*"
+			}
+		},
 		"node_modules/type-fest": {
 			"version": "0.13.1",
 			"resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz",
@@ -10294,7 +11012,6 @@
 			"version": "6.21.0",
 			"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
 			"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
-			"dev": true,
 			"license": "MIT"
 		},
 		"node_modules/universalify": {
@@ -10806,7 +11523,6 @@
 			"version": "1.0.2",
 			"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
 			"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
-			"dev": true,
 			"license": "ISC"
 		},
 		"node_modules/ws": {
diff --git a/package.json b/package.json
index fd0c4cf3d..16a013804 100644
--- a/package.json
+++ b/package.json
@@ -63,6 +63,7 @@
 		"@uiw/color-convert": "^2.10.1",
 		"@uiw/react-color-block": "^2.10.1",
 		"@uiw/react-color-colorful": "^2.9.2",
+		"@xenova/transformers": "^2.17.2",
 		"class-variance-authority": "^0.7.1",
 		"clsx": "^2.1.1",
 		"dnd-timeline": "^2.4.0",
diff --git a/src/App.tsx b/src/App.tsx
index 6c36aa8c5..0c8875d04 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -112,7 +112,7 @@ export default function App() {
 	return (
 		<TooltipProvider>
 			{content}
-			<Toaster theme="dark" className="pointer-events-auto" />
+			<Toaster theme="dark" />
 		</TooltipProvider>
 	);
 }
diff --git a/src/components/ui/select.tsx b/src/components/ui/select.tsx
index d151d164e..bdbf64e9a 100644
--- a/src/components/ui/select.tsx
+++ b/src/components/ui/select.tsx
@@ -82,7 +82,8 @@ const SelectContent = React.forwardRef<
 			<SelectPrimitive.Content
 				ref={ref}
 				className={cn(
-					"relative z-50 max-h-96 min-w-[8rem] overflow-hidden rounded-md border bg-popover text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",
+					/* Above Dialog (z-[10000]) and fullscreen overlays (e.g. z-[99999]) */
+					"relative z-[100000] max-h-96 min-w-[8rem] overflow-hidden rounded-md border bg-popover text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",
 					position === "popper" &&
 						"data-[side=bottom]:translate-y-1 data-[side=left]:-translate-x-1 data-[side=right]:translate-x-1",
 					className,
diff --git a/src/components/ui/sonner.tsx b/src/components/ui/sonner.tsx
index fe3a3906a..3076ac1fe 100644
--- a/src/components/ui/sonner.tsx
+++ b/src/components/ui/sonner.tsx
@@ -1,18 +1,22 @@
 import { Toaster as Sonner } from "sonner";
+import { cn } from "@/lib/utils";
 
 type ToasterProps = React.ComponentProps<typeof Sonner>;
 
-const Toaster = ({ ...props }: ToasterProps) => {
+const Toaster = ({ className, ...props }: ToasterProps) => {
 	return (
 		<Sonner
-			theme="light"
-			className="toaster group"
+			theme="dark"
+			className={cn(
+				"dark toaster group pointer-events-none [&_[data-sonner-toast]]:pointer-events-auto",
+				className,
+			)}
 			duration={3000}
 			toastOptions={{
 				classNames: {
 					toast:
-						"group toast group-[.toaster]:bg-background group-[.toaster]:text-foreground group-[.toaster]:border-border group-[.toaster]:shadow-lg",
-					description: "group-[.toast]:text-muted-foreground",
+						"group toast border border-white/10 bg-[#09090b] text-slate-200 shadow-lg backdrop-blur-xl",
+					description: "group-[.toast]:text-slate-400",
 					actionButton: "group-[.toast]:bg-primary group-[.toast]:text-primary-foreground",
 					cancelButton: "group-[.toast]:bg-muted group-[.toast]:text-muted-foreground",
 				},
diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx
index 05034632e..dda653274 100644
--- a/src/components/video-editor/VideoEditor.tsx
+++ b/src/components/video-editor/VideoEditor.tsx
@@ -1,8 +1,9 @@
 import type { Span } from "dnd-timeline";
 import { FolderOpen, Languages, Save, Video } from "lucide-react";
-import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { type CSSProperties, useCallback, useEffect, useMemo, useRef, useState } from "react";
 import { Panel, PanelGroup, PanelResizeHandle } from "react-resizable-panels";
 import { toast } from "sonner";
+import { Button } from "@/components/ui/button";
 import {
 	Dialog,
 	DialogContent,
@@ -11,11 +12,28 @@ import {
 	DialogHeader,
 	DialogTitle,
 } from "@/components/ui/dialog";
+import { Label } from "@/components/ui/label";
+import {
+	Select,
+	SelectContent,
+	SelectItem,
+	SelectTrigger,
+	SelectValue,
+} from "@/components/ui/select";
 import { useI18n, useScopedT } from "@/contexts/I18nContext";
 import { useShortcuts } from "@/contexts/ShortcutsContext";
 import { INITIAL_EDITOR_STATE, useEditorHistory } from "@/hooks/useEditorHistory";
 import { type Locale } from "@/i18n/config";
 import { getAvailableLocales, getLocaleName } from "@/i18n/loader";
+import {
+	captionSegmentsToAnnotationRegions,
+	extractMono16kFromVideoUrl,
+	MAX_CAPTION_AUDIO_SEC,
+	reconcileAutoCaptionTimelineGaps,
+	shiftTrimRegionsMsForCaptionBuffer,
+	transcribeMono16kToSegments,
+	trimLeadingSilenceMono16k,
+} from "@/lib/captioning";
 import { hasNativeCursorRecordingData } from "@/lib/cursor/nativeCursor";
 import {
 	calculateEffectiveSourceDimensions,
@@ -95,6 +113,9 @@ import {
 import { UnsavedChangesDialog } from "./UnsavedChangesDialog";
 import VideoPlayback, { VideoPlaybackRef } from "./VideoPlayback";
 
+/** Single Sonner slot for auto-caption progress so phases update in place instead of stacking. */
+const AUTO_CAPTION_PROGRESS_TOAST_ID = "auto-caption-progress";
+
 function isClickInteractionType(interactionType: string | null | undefined) {
 	return (
 		interactionType === "click" ||
@@ -151,6 +172,8 @@ function buildSaveDiagnosticMessage(formatLabel: "GIF" | "Video", reason?: strin
 	return `${formatLabel} export save failed${reason ? `\nReason: ${reason}` : ""}`;
 }
 
+const CAPTION_WORD_CHOICES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] as const;
+
 export default function VideoEditor() {
 	const {
 		state: editorState,
@@ -287,6 +310,11 @@ export default function VideoEditor() {
 
 	const nextAnnotationIdRef = useRef(1);
 	const nextAnnotationZIndexRef = useRef(1);
+	const isAutoCaptioningRef = useRef(false);
+	const [isAutoCaptioning, setIsAutoCaptioning] = useState(false);
+	const [showAutoCaptionsDialog, setShowAutoCaptionsDialog] = useState(false);
+	const [captionWordsMin, setCaptionWordsMin] = useState(2);
+	const [captionWordsMax, setCaptionWordsMax] = useState(7);
 	const exporterRef = useRef<VideoExporter | null>(null);
 
 	const annotationOnlyRegions = useMemo(
@@ -1260,8 +1288,11 @@ export default function VideoEditor() {
 
 	const handleAnnotationSpanChange = useCallback(
 		(id: string, span: Span) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
+			pushState((prev) => {
+				const editedAutoCaption =
+					prev.annotationRegions.find((region) => region.id === id)?.annotationSource ===
+					"auto-caption";
+				const next = prev.annotationRegions.map((region) =>
 					region.id === id
 						? {
 								...region,
@@ -1269,8 +1300,11 @@ export default function VideoEditor() {
 								endMs: Math.round(span.end),
 							}
 						: region,
-				),
-			}));
+				);
+				return {
+					annotationRegions: editedAutoCaption ? reconcileAutoCaptionTimelineGaps(next) : next,
+				};
+			});
 		},
 		[pushState],
 	);
@@ -1283,8 +1317,10 @@ export default function VideoEditor() {
 				const source = prev.annotationRegions.find((region) => region.id === id);
 				if (!source) return {};
 
+				const { annotationSource: _stripCaptionLink, ...sourceWithoutCaptionLink } = source;
+
 				const duplicate: AnnotationRegion = {
-					...source,
+					...sourceWithoutCaptionLink,
 					id: duplicateId,
 					zIndex: duplicateZIndex,
 					position: { x: source.position.x + 4, y: source.position.y + 4 },
@@ -1375,11 +1411,18 @@ export default function VideoEditor() {
 
 	const handleAnnotationStyleChange = useCallback(
 		(id: string, style: Partial<AnnotationRegion["style"]>) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
-					region.id === id ? { ...region, style: { ...region.style, ...style } } : region,
-				),
-			}));
+			pushState((prev) => {
+				const touched = prev.annotationRegions.find((r) => r.id === id);
+				const syncAutoCaptions = touched?.annotationSource === "auto-caption";
+				return {
+					annotationRegions: prev.annotationRegions.map((region) => {
+						if (syncAutoCaptions && region.annotationSource === "auto-caption") {
+							return { ...region, style: { ...region.style, ...style } };
+						}
+						return region.id === id ? { ...region, style: { ...region.style, ...style } } : region;
+					}),
+				};
+			});
 		},
 		[pushState],
 	);
@@ -1442,22 +1485,36 @@ export default function VideoEditor() {
 
 	const handleAnnotationPositionChange = useCallback(
 		(id: string, position: { x: number; y: number }) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
-					region.id === id ? { ...region, position } : region,
-				),
-			}));
+			pushState((prev) => {
+				const moved = prev.annotationRegions.find((r) => r.id === id);
+				const syncAutoCaptions = moved?.annotationSource === "auto-caption";
+				return {
+					annotationRegions: prev.annotationRegions.map((region) => {
+						if (syncAutoCaptions && region.annotationSource === "auto-caption") {
+							return { ...region, position };
+						}
+						return region.id === id ? { ...region, position } : region;
+					}),
+				};
+			});
 		},
 		[pushState],
 	);
 
 	const handleAnnotationSizeChange = useCallback(
 		(id: string, size: { width: number; height: number }) => {
-			pushState((prev) => ({
-				annotationRegions: prev.annotationRegions.map((region) =>
-					region.id === id ? { ...region, size } : region,
-				),
-			}));
+			pushState((prev) => {
+				const resized = prev.annotationRegions.find((r) => r.id === id);
+				const syncAutoCaptions = resized?.annotationSource === "auto-caption";
+				return {
+					annotationRegions: prev.annotationRegions.map((region) => {
+						if (syncAutoCaptions && region.annotationSource === "auto-caption") {
+							return { ...region, size };
+						}
+						return region.id === id ? { ...region, size } : region;
+					}),
+				};
+			});
 		},
 		[pushState],
 	);
@@ -2018,6 +2075,139 @@ export default function VideoEditor() {
 		}
 	}, []);
 
+	const generateAutoCaptions = useCallback(
+		async (minWords: number, maxWords: number) => {
+			if (!videoPath) {
+				toast.error(t("errors.noVideoLoaded"));
+				return;
+			}
+			if (isAutoCaptioningRef.current) {
+				toast.error(t("autoCaptions.busy"));
+				return;
+			}
+			const minW = Math.max(1, Math.min(minWords, maxWords));
+			const maxW = Math.max(minW, maxWords);
+
+			isAutoCaptioningRef.current = true;
+			setIsAutoCaptioning(true);
+			toast.loading(t("autoCaptions.generating"), { id: AUTO_CAPTION_PROGRESS_TOAST_ID });
+			try {
+				const { samples, truncated, durationSec } = await extractMono16kFromVideoUrl(videoPath);
+				if (!Number.isFinite(durationSec) || durationSec <= 0 || samples.length < 800) {
+					toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
+					toast.error(t("autoCaptions.noAudio"));
+					return;
+				}
+
+				const { samples: speechSamples, trimSec } = trimLeadingSilenceMono16k(samples);
+				if (speechSamples.length < 800) {
+					toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
+					toast.error(t("autoCaptions.noAudio"));
+					return;
+				}
+
+				const trimMs = Math.round(trimSec * 1000);
+				const trimRegionsForTranscribe = shiftTrimRegionsMsForCaptionBuffer(trimRegions, trimMs);
+
+				const transcribeOptions = {
+					onStatus: (phase: "model" | "transcribe") => {
+						if (phase === "model") {
+							toast.loading(t("autoCaptions.loadingModel"), {
+								id: AUTO_CAPTION_PROGRESS_TOAST_ID,
+							});
+						} else {
+							toast.loading(t("autoCaptions.transcribing"), {
+								id: AUTO_CAPTION_PROGRESS_TOAST_ID,
+							});
+						}
+					},
+				};
+
+				let { segments: segmentsRaw, granularity } = await transcribeMono16kToSegments(
+					speechSamples,
+					{
+						trimRegions: trimRegionsForTranscribe,
+						...transcribeOptions,
+					},
+				);
+				let transcribedFromTrimmedBuffer = true;
+
+				// Some recordings come back empty after leading-silence trimming even though the full
+				// source has recognizable speech. Retry once against the untouched audio buffer before
+				// giving up so we do not show "no speech detected" for a spoken clip.
+				if (segmentsRaw.length === 0 && trimSec > 0) {
+					({ segments: segmentsRaw, granularity } = await transcribeMono16kToSegments(samples, {
+						trimRegions,
+						...transcribeOptions,
+					}));
+					transcribedFromTrimmedBuffer = false;
+				}
+
+				const segments =
+					transcribedFromTrimmedBuffer && trimSec > 0
+						? segmentsRaw.map((s) => ({
+								...s,
+								startSec: s.startSec + trimSec,
+								endSec: s.endSec + trimSec,
+							}))
+						: segmentsRaw;
+
+				let { regions, nextNumericId, nextZIndex } = captionSegmentsToAnnotationRegions(
+					segments,
+					nextAnnotationIdRef.current,
+					nextAnnotationZIndexRef.current,
+					{
+						minWordsPerCaption: minW,
+						maxWordsPerCaption: maxW,
+						timestampGranularity: granularity,
+					},
+				);
+
+				if (regions.length === 0 && segments.length > 0) {
+					({ regions, nextNumericId, nextZIndex } = captionSegmentsToAnnotationRegions(
+						segments,
+						nextAnnotationIdRef.current,
+						nextAnnotationZIndexRef.current,
+						{
+							minWordsPerCaption: 1,
+							maxWordsPerCaption: Number.MAX_SAFE_INTEGER,
+							timestampGranularity: granularity,
+						},
+					));
+				}
+
+				if (regions.length === 0) {
+					toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
+					toast.info(t("autoCaptions.noneHeard"));
+					return;
+				}
+
+				pushState((prev) => ({ annotationRegions: [...prev.annotationRegions, ...regions] }));
+				nextAnnotationIdRef.current = nextNumericId;
+				nextAnnotationZIndexRef.current = nextZIndex;
+
+				toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
+				const minutesTrunc = String(Math.round(MAX_CAPTION_AUDIO_SEC / 60));
+				if (truncated) {
+					toast.success(t("autoCaptions.done", { count: String(regions.length) }), {
+						description: t("autoCaptions.truncated", { minutes: minutesTrunc }),
+					});
+				} else {
+					toast.success(t("autoCaptions.done", { count: String(regions.length) }));
+				}
+			} catch (e) {
+				console.error(e);
+				toast.dismiss(AUTO_CAPTION_PROGRESS_TOAST_ID);
+				const detail = e instanceof Error ? e.message : String(e);
+				toast.error(t("autoCaptions.failed"), { description: detail });
+			} finally {
+				isAutoCaptioningRef.current = false;
+				setIsAutoCaptioning(false);
+			}
+		},
+		[videoPath, trimRegions, pushState, t],
+	);
+
 	const handleSaveDiagnostic = useCallback(async () => {
 		const result = await window.electronAPI.saveDiagnostic({
 			error: exportError ?? "Manual diagnostic export",
@@ -2060,7 +2250,7 @@ export default function VideoEditor() {
 			<Dialog open={showNewRecordingDialog} onOpenChange={setShowNewRecordingDialog}>
 				<DialogContent
 					className="sm:max-w-[425px]"
-					style={{ WebkitAppRegion: "no-drag" } as React.CSSProperties}
+					style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
 				>
 					<DialogHeader>
 						<DialogTitle>{t("newRecording.title")}</DialogTitle>
@@ -2085,13 +2275,92 @@ export default function VideoEditor() {
 				</DialogContent>
 			</Dialog>
 
+			<Dialog open={showAutoCaptionsDialog} onOpenChange={setShowAutoCaptionsDialog}>
+				<DialogContent
+					className="sm:max-w-md"
+					style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
+				>
+					<DialogHeader>
+						<DialogTitle>{t("autoCaptions.dialogTitle")}</DialogTitle>
+						<DialogDescription>{t("autoCaptions.dialogDescription")}</DialogDescription>
+					</DialogHeader>
+					<div className="grid gap-4 py-2">
+						<div className="grid gap-2">
+							<Label htmlFor="caption-min-words">{t("autoCaptions.minWords")}</Label>
+							<Select
+								value={String(captionWordsMin)}
+								onValueChange={(v) => {
+									const n = Number.parseInt(v, 10);
+									setCaptionWordsMin(n);
+									if (n > captionWordsMax) setCaptionWordsMax(n);
+								}}
+							>
+								<SelectTrigger id="caption-min-words" className="h-9">
+									<SelectValue />
+								</SelectTrigger>
+								<SelectContent>
+									{CAPTION_WORD_CHOICES.map((n) => (
+										<SelectItem key={`min-${n}`} value={String(n)}>
+											{t("autoCaptions.wordsCount", { count: String(n) })}
+										</SelectItem>
+									))}
+								</SelectContent>
+							</Select>
+						</div>
+						<div className="grid gap-2">
+							<Label htmlFor="caption-max-words">{t("autoCaptions.maxWords")}</Label>
+							<Select
+								value={String(captionWordsMax)}
+								onValueChange={(v) => {
+									const n = Number.parseInt(v, 10);
+									setCaptionWordsMax(n);
+									if (n < captionWordsMin) setCaptionWordsMin(n);
+								}}
+							>
+								<SelectTrigger id="caption-max-words" className="h-9">
+									<SelectValue />
+								</SelectTrigger>
+								<SelectContent>
+									{CAPTION_WORD_CHOICES.map((n) => (
+										<SelectItem key={`max-${n}`} value={String(n)}>
+											{t("autoCaptions.wordsCount", { count: String(n) })}
+										</SelectItem>
+									))}
+								</SelectContent>
+							</Select>
+						</div>
+					</div>
+					<DialogFooter className="gap-2 sm:gap-0">
+						<Button
+							type="button"
+							variant="outline"
+							onClick={() => setShowAutoCaptionsDialog(false)}
+							className="border-white/20 bg-transparent text-white hover:bg-white/10"
+						>
+							{t("autoCaptions.dialogCancel")}
+						</Button>
+						<Button
+							type="button"
+							disabled={isAutoCaptioning}
+							onClick={() => {
+								setShowAutoCaptionsDialog(false);
+								void generateAutoCaptions(captionWordsMin, captionWordsMax);
+							}}
+							className="bg-[#34B27B] text-white hover:bg-[#34B27B]/90"
+						>
+							{t("autoCaptions.generate")}
+						</Button>
+					</DialogFooter>
+				</DialogContent>
+			</Dialog>
+
 			<div
 				className="h-11 flex-shrink-0 bg-[#070809]/85 backdrop-blur-xl border-b border-white/[0.07] flex items-center justify-between px-5 z-50 shadow-[0_1px_0_rgba(255,255,255,0.03)]"
-				style={{ WebkitAppRegion: "drag" } as React.CSSProperties}
+				style={{ WebkitAppRegion: "drag" } as CSSProperties}
 			>
 				<div
 					className="flex-1 flex items-center gap-1"
-					style={{ WebkitAppRegion: "no-drag" } as React.CSSProperties}
+					style={{ WebkitAppRegion: "no-drag" } as CSSProperties}
 				>
 					<div
 						className={`flex items-center gap-1.5 px-2.5 py-1.5 rounded-lg text-white/50 hover:text-white/90 hover:bg-white/[0.08] transition-all duration-150 ${isMac ? "ml-14" : "ml-2"}`}
@@ -2489,6 +2758,19 @@ export default function VideoEditor() {
 									}
 									videoUrl={videoPath ?? undefined}
 									showTrimWaveform={showTrimWaveform}
+									captionsLabel={t("autoCaptions.button")}
+									isGeneratingCaptions={isAutoCaptioning}
+									onGenerateCaptions={() => {
+										if (!videoPath) {
+											toast.error(t("errors.noVideoLoaded"));
+											return;
+										}
+										if (isAutoCaptioningRef.current) {
+											toast.error(t("autoCaptions.busy"));
+											return;
+										}
+										setShowAutoCaptionsDialog(true);
+									}}
 								/>
 							</div>
 						</Panel>
diff --git a/src/components/video-editor/projectPersistence.ts b/src/components/video-editor/projectPersistence.ts
index ff59427f2..1fefa43e9 100644
--- a/src/components/video-editor/projectPersistence.ts
+++ b/src/components/video-editor/projectPersistence.ts
@@ -333,6 +333,8 @@ export function normalizeProjectEditor(editor: Partial<ProjectEditorState>): Pro
 						content: typeof region.content === "string" ? region.content : "",
 						textContent: typeof region.textContent === "string" ? region.textContent : undefined,
 						imageContent: typeof region.imageContent === "string" ? region.imageContent : undefined,
+						annotationSource:
+							region.annotationSource === "auto-caption" ? ("auto-caption" as const) : undefined,
 						position: {
 							x: clamp(
 								isFiniteNumber(region.position?.x)
diff --git a/src/components/video-editor/timeline/TimelineEditor.tsx b/src/components/video-editor/timeline/TimelineEditor.tsx
index f84d038a9..65ebd8bdb 100644
--- a/src/components/video-editor/timeline/TimelineEditor.tsx
+++ b/src/components/video-editor/timeline/TimelineEditor.tsx
@@ -1,6 +1,7 @@
 import type { Range, Span } from "dnd-timeline";
 import { useTimelineContext } from "dnd-timeline";
 import {
+	Captions,
 	Check,
 	ChevronDown,
 	Gauge,
@@ -92,6 +93,11 @@ interface TimelineEditorProps {
 	onAspectRatioChange: (aspectRatio: AspectRatio) => void;
 	videoUrl?: string;
 	showTrimWaveform?: boolean;
+	/** Opens the auto-captions flow. When omitted, the captions button is hidden. */
+	onGenerateCaptions?: () => void;
+	isGeneratingCaptions?: boolean;
+	/** Localized label for the auto-captions button (lives in the `editor` namespace). */
+	captionsLabel?: string;
 }
 
 interface TimelineScaleConfig {
@@ -924,6 +930,9 @@ export default function TimelineEditor({
 	onAspectRatioChange,
 	videoUrl,
 	showTrimWaveform = false,
+	onGenerateCaptions,
+	isGeneratingCaptions = false,
+	captionsLabel,
 }: TimelineEditorProps) {
 	const t = useScopedT("timeline");
 	const totalMs = useMemo(() => Math.max(0, Math.round(videoDuration * 1000)), [videoDuration]);
@@ -1659,6 +1668,18 @@ export default function TimelineEditor({
 					>
 						<Gauge className="w-4 h-4" />
 					</Button>
+					{onGenerateCaptions && (
+						<Button
+							onClick={onGenerateCaptions}
+							disabled={isGeneratingCaptions || !videoUrl}
+							variant="ghost"
+							size="icon"
+							className="h-7 w-7 rounded-lg text-slate-400 hover:text-[#a78bfa] hover:bg-[#a78bfa]/10 transition-all"
+							title={captionsLabel}
+						>
+							<Captions className="w-4 h-4" />
+						</Button>
+					)}
 				</div>
 				<div className="flex items-center gap-1.5 min-w-0">
 					<DropdownMenu>
diff --git a/src/components/video-editor/types.ts b/src/components/video-editor/types.ts
index 0f2267cca..1aca90af1 100644
--- a/src/components/video-editor/types.ts
+++ b/src/components/video-editor/types.ts
@@ -288,6 +288,8 @@ export interface AnnotationRegion {
 	size: AnnotationSize;
 	style: AnnotationTextStyle;
 	zIndex: number;
+	/** Marks regions created by auto-captioning; position, size, and style edits to one are applied to every region carrying this marker. */
+	annotationSource?: "auto-caption";
 	figureData?: FigureData;
 	blurData?: BlurData;
 }
diff --git a/src/i18n/locales/ar/editor.json b/src/i18n/locales/ar/editor.json
index b3e122280..39750e5eb 100644
--- a/src/i18n/locales/ar/editor.json
+++ b/src/i18n/locales/ar/editor.json
@@ -44,6 +44,25 @@
 		"permissionDenied": "تم رفض إذن التسجيل. يرجى السماح بتسجيل الشاشة.",
 		"accessibilityAllowAndRetry": "اسمح بوصول تسهيلات الاستخدام لـ OpenScreen، ثم اضغط على التسجيل مرة أخرى لبدء العد التنازلي."
 	},
+	"autoCaptions": {
+		"button": "التسميات التوضيحية التلقائية",
+		"dialogTitle": "التسميات التوضيحية التلقائية",
+		"dialogDescription": "اختر تقريبا كم عدد الكلمات التي تظهر في كل تسمية توضيحية. يتم توزيع التوقيت عبر الكلمات في تلك العبارة.",
+		"minWords": "الحد الأدنى من الكلمات لكل تسمية",
+		"maxWords": "الحد الأقصى من الكلمات لكل تسمية",
+		"wordsCount": "{{count}} كلمة",
+		"generate": "توليد",
+		"dialogCancel": "إلغاء",
+		"generating": "جارٍ توليد التسميات من الصوت…",
+		"loadingModel": "جارٍ تحميل نموذج الكلام (سيتم تنزيل ~75 ميغابايت عند الاستخدام الأول)…",
+		"transcribing": "جارٍ نسخ الكلام إلى نص…",
+		"busy": "توليد التسميات قيد التنفيذ بالفعل.",
+		"done": "تمت إضافة {{count}} تسمية.",
+		"noneHeard": "لم يتم الكشف عن أي كلام.",
+		"noAudio": "لا يحتوي هذا الفيديو على صوت صالح للنسخ.",
+		"failed": "تعذّر توليد التسميات.",
+		"truncated": "تم نسخ الدقائق الأولى فقط: {{minutes}} دقيقة."
+	},
 	"emptyState": {
 		"title": "لا يوجد مشروع مفتوح",
 		"description": "استورد مقطع فيديو للبدء في التحرير، أو حمّل مشروع OpenScreen موجود.",
diff --git a/src/i18n/locales/en/editor.json b/src/i18n/locales/en/editor.json
index ebd9a5d5f..d6a56f033 100644
--- a/src/i18n/locales/en/editor.json
+++ b/src/i18n/locales/en/editor.json
@@ -44,6 +44,25 @@
 		"permissionDenied": "Recording permission denied. Please allow screen recording.",
 		"accessibilityAllowAndRetry": "Allow Accessibility access for OpenScreen, then press record again to start the countdown."
 	},
+	"autoCaptions": {
+		"button": "Auto captions",
+		"dialogTitle": "Auto captions",
+		"dialogDescription": "Choose roughly how many words each caption shows at once. Timing is spread across the words in that phrase.",
+		"minWords": "Minimum words per caption",
+		"maxWords": "Maximum words per caption",
+		"wordsCount": "{{count}} words",
+		"generate": "Generate",
+		"dialogCancel": "Cancel",
+		"generating": "Generating captions from audio…",
+		"loadingModel": "Loading speech model (first use downloads ~75 MB)…",
+		"transcribing": "Transcribing speech…",
+		"busy": "Caption generation is already in progress.",
+		"done": "Added {{count}} captions.",
+		"noneHeard": "No speech was detected.",
+		"noAudio": "This video has no usable audio to transcribe.",
+		"failed": "Could not generate captions.",
+		"truncated": "Only the first {{minutes}} minutes were transcribed."
+	},
 	"emptyState": {
 		"title": "No project open",
 		"description": "Import a video to start editing, or load an existing OpenScreen project.",
diff --git a/src/i18n/locales/es/editor.json b/src/i18n/locales/es/editor.json
index 16a2c8547..277ce40ff 100644
--- a/src/i18n/locales/es/editor.json
+++ b/src/i18n/locales/es/editor.json
@@ -44,6 +44,25 @@
 		"cancel": "Cancelar",
 		"confirm": "Confirmar"
 	},
+	"autoCaptions": {
+		"button": "Subtítulos automáticos",
+		"dialogTitle": "Subtítulos automáticos",
+		"dialogDescription": "Elige aproximadamente cuántas palabras muestra cada subtítulo a la vez. El tiempo se reparte entre las palabras de esa frase.",
+		"minWords": "Número mínimo de palabras por subtítulo",
+		"maxWords": "Número máximo de palabras por subtítulo",
+		"wordsCount": "{{count}} palabras",
+		"generate": "Generar",
+		"dialogCancel": "Cancelar",
+		"generating": "Generando subtítulos a partir del audio…",
+		"loadingModel": "Cargando el modelo de voz (el primer uso descarga ~75 MB)…",
+		"transcribing": "Transcribiendo el habla…",
+		"busy": "La generación de subtítulos ya está en curso.",
+		"done": "Se añadieron {{count}} subtítulos.",
+		"noneHeard": "No se detectó voz.",
+		"noAudio": "Este video no tiene audio utilizable para transcribir.",
+		"failed": "No se pudieron generar los subtítulos.",
+		"truncated": "Solo se transcribieron los primeros {{minutes}} minutos."
+	},
 	"emptyState": {
 		"title": "No hay proyecto abierto",
 		"description": "Importa un video para empezar a editar o carga un proyecto de OpenScreen existente.",
diff --git a/src/i18n/locales/fr/editor.json b/src/i18n/locales/fr/editor.json
index 4eb57a9cc..40dc24fd7 100644
--- a/src/i18n/locales/fr/editor.json
+++ b/src/i18n/locales/fr/editor.json
@@ -44,6 +44,25 @@
 	},
 	"loadingVideo": "Chargement de la vidéo...",
 	"loadingEditor": "Chargement de l'éditeur...",
+	"autoCaptions": {
+		"button": "Sous-titres automatiques",
+		"dialogTitle": "Sous-titres automatiques",
+		"dialogDescription": "Choisissez approximativement combien de mots chaque sous-titre affiche à la fois. Le timing est réparti entre les mots de cette phrase.",
+		"minWords": "Nombre minimum de mots par sous-titre",
+		"maxWords": "Nombre maximum de mots par sous-titre",
+		"wordsCount": "{{count}} mots",
+		"generate": "Générer",
+		"dialogCancel": "Annuler",
+		"generating": "Génération des sous-titres à partir de l'audio…",
+		"loadingModel": "Chargement du modèle vocal (le premier usage télécharge ~75 MB)…",
+		"transcribing": "Transcription de la parole…",
+		"busy": "La génération des sous-titres est déjà en cours.",
+		"done": "{{count}} sous-titres ajoutés.",
+		"noneHeard": "Aucune parole n'a été détectée.",
+		"noAudio": "Cette vidéo ne contient pas d'audio exploitable pour la transcription.",
+		"failed": "Impossible de générer les sous-titres.",
+		"truncated": "Seules les {{minutes}} premières minutes ont été transcrites."
+	},
 	"emptyState": {
 		"title": "Aucun projet ouvert",
 		"description": "Importez une vidéo pour commencer à éditer, ou chargez un projet OpenScreen existant.",
diff --git a/src/i18n/locales/it/editor.json b/src/i18n/locales/it/editor.json
index 336d3e6ba..0e94b9a9f 100644
--- a/src/i18n/locales/it/editor.json
+++ b/src/i18n/locales/it/editor.json
@@ -42,5 +42,24 @@
 		"cameraNotFound": "Fotocamera non trovata.",
 		"permissionDenied": "Autorizzazione di registrazione negata. Consenti la registrazione dello schermo.",
 		"accessibilityAllowAndRetry": "Consenti l'accesso all'accessibilità per OpenScreen, poi premi di nuovo registra per avviare il conto alla rovescia."
+	},
+	"autoCaptions": {
+		"button": "Sottotitoli automatici",
+		"dialogTitle": "Sottotitoli automatici",
+		"dialogDescription": "Scegli all'incirca quante parole mostrare per ogni sottotitolo. La temporizzazione viene distribuita tra le parole della frase.",
+		"minWords": "Numero minimo di parole per sottotitolo",
+		"maxWords": "Numero massimo di parole per sottotitolo",
+		"wordsCount": "{{count}} parole",
+		"generate": "Genera",
+		"dialogCancel": "Annulla",
+		"generating": "Generazione dei sottotitoli dall'audio…",
+		"loadingModel": "Caricamento del modello vocale (al primo utilizzo vengono scaricati ~75 MB)…",
+		"transcribing": "Trascrizione del parlato…",
+		"busy": "La generazione dei sottotitoli è già in corso.",
+		"done": "Aggiunti {{count}} sottotitoli.",
+		"noneHeard": "Nessun parlato rilevato.",
+		"noAudio": "Questo video non contiene audio utilizzabile per la trascrizione.",
+		"failed": "Impossibile generare i sottotitoli.",
+		"truncated": "Sono stati trascritti solo i primi {{minutes}} minuti."
 	}
 }
diff --git a/src/i18n/locales/ja-JP/editor.json b/src/i18n/locales/ja-JP/editor.json
index 5151d1054..8e0da42e1 100644
--- a/src/i18n/locales/ja-JP/editor.json
+++ b/src/i18n/locales/ja-JP/editor.json
@@ -44,6 +44,25 @@
 		"cameraNotFound": "カメラが見つかりません。",
 		"accessibilityAllowAndRetry": "OpenScreenにアクセシビリティアクセスを許可してから、もう一度録画を押してカウントダウンを開始してください。"
 	},
+	"autoCaptions": {
+		"button": "自動キャプション",
+		"dialogTitle": "自動キャプション",
+		"dialogDescription": "各キャプションに一度に表示する語数の目安を選びます。タイミングはそのフレーズ内の語に分配されます。",
+		"minWords": "キャプションあたりの最小語数",
+		"maxWords": "キャプションあたりの最大語数",
+		"wordsCount": "{{count}} 語",
+		"generate": "生成",
+		"dialogCancel": "キャンセル",
+		"generating": "音声からキャプションを生成しています…",
+		"loadingModel": "音声モデルを読み込んでいます（初回利用時は約 75 MB をダウンロードします）…",
+		"transcribing": "音声を文字起こししています…",
+		"busy": "キャプションの生成はすでに実行中です。",
+		"done": "{{count}} 件のキャプションを追加しました。",
+		"noneHeard": "音声が検出されませんでした。",
+		"noAudio": "この動画には書き起こしに使える音声がありません。",
+		"failed": "キャプションを生成できませんでした。",
+		"truncated": "最初の {{minutes}} 分のみが書き起こされました。"
+	},
 	"emptyState": {
 		"title": "プロジェクトが開かれていません",
 		"description": "動画をインポートして編集を開始するか、既存の OpenScreen プロジェクトを読み込んでください。",
diff --git a/src/i18n/locales/ko-KR/editor.json b/src/i18n/locales/ko-KR/editor.json
index 23990c386..a63a22a57 100644
--- a/src/i18n/locales/ko-KR/editor.json
+++ b/src/i18n/locales/ko-KR/editor.json
@@ -44,6 +44,25 @@
 		"cameraNotFound": "카메라를 찾을 수 없습니다.",
 		"accessibilityAllowAndRetry": "OpenScreen의 손쉬운 사용 접근을 허용한 다음, 카운트다운을 시작하려면 다시 녹화를 누르세요."
 	},
+	"autoCaptions": {
+		"button": "자동 자막",
+		"dialogTitle": "자동 자막",
+		"dialogDescription": "각 자막에 한 번에 표시할 단어 수의 대략적인 값을 선택하세요. 타이밍은 해당 구문의 단어들에 나뉩니다.",
+		"minWords": "자막당 최소 단어 수",
+		"maxWords": "자막당 최대 단어 수",
+		"wordsCount": "{{count}}개 단어",
+		"generate": "생성",
+		"dialogCancel": "취소",
+		"generating": "오디오에서 자막을 생성하는 중…",
+		"loadingModel": "음성 모델을 불러오는 중(첫 사용 시 약 75MB 다운로드)…",
+		"transcribing": "음성을 전사하는 중…",
+		"busy": "자막 생성이 이미 진행 중입니다.",
+		"done": "자막 {{count}}개를 추가했습니다.",
+		"noneHeard": "음성이 감지되지 않았습니다.",
+		"noAudio": "이 동영상에는 전사에 사용할 수 있는 음성이 없습니다.",
+		"failed": "자막을 생성할 수 없습니다.",
+		"truncated": "처음 {{minutes}}분만 전사되었습니다."
+	},
 	"emptyState": {
 		"title": "열린 프로젝트 없음",
 		"description": "동영상을 가져와 편집을 시작하거나 기존 OpenScreen 프로젝트를 불러오세요.",
diff --git a/src/i18n/locales/pt-BR/editor.json b/src/i18n/locales/pt-BR/editor.json
index 7e3f69531..b0e9ab8c9 100644
--- a/src/i18n/locales/pt-BR/editor.json
+++ b/src/i18n/locales/pt-BR/editor.json
@@ -41,5 +41,24 @@
 		"cameraDisconnected": "Webcam desconectada.",
 		"cameraNotFound": "Câmera não encontrada.",
 		"permissionDenied": "Permissão de gravação negada. Por favor, permita a gravação de tela."
+	},
+	"autoCaptions": {
+		"button": "Legendas automáticas",
+		"dialogTitle": "Legendas automáticas",
+		"dialogDescription": "Escolha aproximadamente quantas palavras cada legenda mostra de cada vez. O tempo é distribuído entre as palavras da frase.",
+		"minWords": "Mínimo de palavras por legenda",
+		"maxWords": "Máximo de palavras por legenda",
+		"wordsCount": "{{count}} palavras",
+		"generate": "Gerar",
+		"dialogCancel": "Cancelar",
+		"generating": "Gerando legendas a partir do áudio…",
+		"loadingModel": "Carregando o modelo de fala (o primeiro uso baixa ~75 MB)…",
+		"transcribing": "Transcrevendo a fala…",
+		"busy": "A geração de legendas já está em andamento.",
+		"done": "{{count}} legendas adicionadas.",
+		"noneHeard": "Nenhuma fala foi detectada.",
+		"noAudio": "Este vídeo não tem áudio utilizável para transcrição.",
+		"failed": "Não foi possível gerar as legendas.",
+		"truncated": "Apenas os primeiros {{minutes}} minutos foram transcritos."
 	}
 }
diff --git a/src/i18n/locales/ru/editor.json b/src/i18n/locales/ru/editor.json
index ff0c80b8b..78fa129a1 100644
--- a/src/i18n/locales/ru/editor.json
+++ b/src/i18n/locales/ru/editor.json
@@ -44,6 +44,25 @@
 		"permissionDenied": "Разрешение на запись запрещено. Пожалуйста, разрешите запись экрана.",
 		"accessibilityAllowAndRetry": "Разрешите OpenScreen доступ к Универсальному доступу, затем снова нажмите запись, чтобы начать обратный отсчет."
 	},
+	"autoCaptions": {
+		"button": "Автосубтитры",
+		"dialogTitle": "Автосубтитры",
+		"dialogDescription": "Выберите, сколько примерно слов показывать в одном субтитре. Время распределяется между словами фразы.",
+		"minWords": "Минимум слов в субтитре",
+		"maxWords": "Максимум слов в субтитре",
+		"wordsCount": "{{count}} слов",
+		"generate": "Создать",
+		"dialogCancel": "Отмена",
+		"generating": "Создание субтитров из звука…",
+		"loadingModel": "Загрузка речевой модели (при первом запуске скачивается ~75 МБ)…",
+		"transcribing": "Распознавание речи…",
+		"busy": "Создание субтитров уже выполняется.",
+		"done": "Добавлено субтитров: {{count}}.",
+		"noneHeard": "Речь не обнаружена.",
+		"noAudio": "В этом видео нет звука, пригодного для расшифровки.",
+		"failed": "Не удалось создать субтитры.",
+		"truncated": "Расшифрованы только первые {{minutes}} мин."
+	},
 	"emptyState": {
 		"title": "Нет открытых проектов",
 		"description": "Импортируйте видео для начала редактирования или загрузите существующий проект OpenScreen.",
diff --git a/src/i18n/locales/tr/editor.json b/src/i18n/locales/tr/editor.json
index de45a180f..89203e719 100644
--- a/src/i18n/locales/tr/editor.json
+++ b/src/i18n/locales/tr/editor.json
@@ -44,6 +44,25 @@
 		"cancel": "İptal",
 		"confirm": "Onayla"
 	},
+	"autoCaptions": {
+		"button": "Otomatik altyazılar",
+		"dialogTitle": "Otomatik altyazılar",
+		"dialogDescription": "Her altyazının aynı anda yaklaşık kaç kelime göstermesini istediğinizi seçin. Zamanlama, o ifadedeki kelimelere dağıtılır.",
+		"minWords": "Altyazı başına en az kelime",
+		"maxWords": "Altyazı başına en fazla kelime",
+		"wordsCount": "{{count}} kelime",
+		"generate": "Oluştur",
+		"dialogCancel": "İptal",
+		"generating": "Sesten altyazılar oluşturuluyor…",
+		"loadingModel": "Konuşma modeli yükleniyor (ilk kullanımda ~75 MB indirilir)…",
+		"transcribing": "Konuşma yazıya dökülüyor…",
+		"busy": "Altyazı oluşturma zaten devam ediyor.",
+		"done": "{{count}} altyazı eklendi.",
+		"noneHeard": "Konuşma algılanmadı.",
+		"noAudio": "Bu videoda yazıya dökülebilecek kullanılabilir bir ses yok.",
+		"failed": "Altyazılar oluşturulamadı.",
+		"truncated": "Yalnızca ilk {{minutes}} dakika yazıya döküldü."
+	},
 	"emptyState": {
 		"title": "Açık proje yok",
 		"description": "Düzenlemeye başlamak için bir video içe aktarın veya mevcut bir OpenScreen projesi yükleyin.",
diff --git a/src/i18n/locales/vi/editor.json b/src/i18n/locales/vi/editor.json
index 1875bb559..90004091e 100644
--- a/src/i18n/locales/vi/editor.json
+++ b/src/i18n/locales/vi/editor.json
@@ -44,6 +44,25 @@
 		"permissionDenied": "Quyền ghi hình bị từ chối. Vui lòng cho phép ghi màn hình.",
 		"accessibilityAllowAndRetry": "Cho phép OpenScreen truy cập Trợ năng, sau đó nhấn ghi lại để bắt đầu đếm ngược."
 	},
+	"autoCaptions": {
+		"button": "Phụ đề tự động",
+		"dialogTitle": "Phụ đề tự động",
+		"dialogDescription": "Chọn khoảng bao nhiêu từ mỗi phụ đề hiển thị cùng lúc. Thời gian được phân bổ cho các từ trong cụm từ đó.",
+		"minWords": "Số từ tối thiểu mỗi phụ đề",
+		"maxWords": "Số từ tối đa mỗi phụ đề",
+		"wordsCount": "{{count}} từ",
+		"generate": "Tạo",
+		"dialogCancel": "Hủy",
+		"generating": "Đang tạo phụ đề từ âm thanh…",
+		"loadingModel": "Đang tải mô hình giọng nói (lần đầu sử dụng sẽ tải ~75 MB)…",
+		"transcribing": "Đang chuyển lời nói thành văn bản…",
+		"busy": "Việc tạo phụ đề đang được tiến hành.",
+		"done": "Đã thêm {{count}} phụ đề.",
+		"noneHeard": "Không phát hiện thấy lời nói.",
+		"noAudio": "Video này không có âm thanh dùng được để chuyển thành văn bản.",
+		"failed": "Không thể tạo phụ đề.",
+		"truncated": "Chỉ {{minutes}} phút đầu tiên được chuyển thành văn bản."
+	},
 	"emptyState": {
 		"title": "Không có dự án nào được mở",
 		"description": "Nhập video để bắt đầu chỉnh sửa hoặc tải một dự án OpenScreen hiện có.",
diff --git a/src/i18n/locales/zh-CN/editor.json b/src/i18n/locales/zh-CN/editor.json
index d11f1dd95..58f6ae27b 100644
--- a/src/i18n/locales/zh-CN/editor.json
+++ b/src/i18n/locales/zh-CN/editor.json
@@ -44,6 +44,25 @@
 		"permissionDenied": "录屏权限被拒绝。请允许屏幕录制。",
 		"accessibilityAllowAndRetry": "允许 OpenScreen 使用辅助功能权限，然后再次按录制以开始倒计时。"
 	},
+	"autoCaptions": {
+		"button": "自动字幕",
+		"dialogTitle": "自动字幕",
+		"dialogDescription": "大致选择每条字幕一次显示多少个字词。时间会在该语句内的字词之间分配。",
+		"minWords": "每条字幕的最少字数",
+		"maxWords": "每条字幕的最多字数",
+		"wordsCount": "{{count}} 个词",
+		"generate": "生成",
+		"dialogCancel": "取消",
+		"generating": "正在从音频生成字幕…",
+		"loadingModel": "正在加载语音模型（首次使用将下载约 75 MB）…",
+		"transcribing": "正在转写语音…",
+		"busy": "字幕生成已在进行中。",
+		"done": "已添加 {{count}} 条字幕。",
+		"noneHeard": "未检测到语音。",
+		"noAudio": "此视频没有可用于转写的音频。",
+		"failed": "无法生成字幕。",
+		"truncated": "仅转写了最前 {{minutes}} 分钟。"
+	},
 	"emptyState": {
 		"title": "未打开任何项目",
 		"description": "导入视频开始编辑，或加载已有的 OpenScreen 项目。",
diff --git a/src/i18n/locales/zh-TW/editor.json b/src/i18n/locales/zh-TW/editor.json
index 131518713..8a6485409 100644
--- a/src/i18n/locales/zh-TW/editor.json
+++ b/src/i18n/locales/zh-TW/editor.json
@@ -44,6 +44,25 @@
 		"cameraNotFound": "找不到攝影機。",
 		"accessibilityAllowAndRetry": "允許 OpenScreen 使用輔助使用權限，然後再次按下錄製以開始倒數。"
 	},
+	"autoCaptions": {
+		"button": "自動字幕",
+		"dialogTitle": "自動字幕",
+		"dialogDescription": "大致選擇每條字幕一次顯示多少字詞。時間會在該語句內的字詞之間分配。",
+		"minWords": "每條字幕的最少字數",
+		"maxWords": "每條字幕的最多字數",
+		"wordsCount": "{{count}} 個詞",
+		"generate": "產生",
+		"dialogCancel": "取消",
+		"generating": "正在從音訊產生字幕…",
+		"loadingModel": "正在載入語音模型（首次使用將下載約 75 MB）…",
+		"transcribing": "正在轉錄語音…",
+		"busy": "字幕產生已在進行中。",
+		"done": "已新增 {{count}} 條字幕。",
+		"noneHeard": "未偵測到語音。",
+		"noAudio": "此影片沒有可用於轉寫的音訊。",
+		"failed": "無法產生字幕。",
+		"truncated": "僅轉寫了最前 {{minutes}} 分鐘。"
+	},
 	"emptyState": {
 		"title": "未開啟任何專案",
 		"description": "匯入影片以開始編輯，或載入現有的 OpenScreen 專案。",
diff --git a/src/lib/captioning/annotationsFromCaptions.test.ts b/src/lib/captioning/annotationsFromCaptions.test.ts
new file mode 100644
index 000000000..bbf26fed2
--- /dev/null
+++ b/src/lib/captioning/annotationsFromCaptions.test.ts
@@ -0,0 +1,178 @@
+import { describe, expect, it } from "vitest";
+
+import {
+	captionSegmentsToAnnotationRegions,
+	groupPhraseCaptionSegmentsIntoLines,
+	groupTimedCaptionWordsIntoLines,
+	reconcileAutoCaptionTimelineGaps,
+} from "./annotationsFromCaptions";
+
+describe("groupPhraseCaptionSegmentsIntoLines", () => {
+	it("preserves phrase boundaries when formatting phrase-timestamp captions", () => {
+		const lines = groupPhraseCaptionSegmentsIntoLines(
+			[
+				{ startSec: 0, endSec: 0.5, text: "alpha beta" },
+				{ startSec: 0.62, endSec: 1.6, text: "gamma delta" },
+			],
+			2,
+			2,
+		);
+
+		expect(lines).toHaveLength(2);
+		expect(lines[0]).toMatchObject({ text: "alpha beta", startSec: 0 });
+		expect(lines[1]).toMatchObject({ text: "gamma delta", startSec: 0.62 });
+		expect(lines[0]!.endSec).toBeLessThanOrEqual(0.62); // first line must not run past the second phrase's start
+	});
+
+	it("slices a single merged phrase into timed caption lines by word bounds", () => {
+		const lines = groupPhraseCaptionSegmentsIntoLines(
+			[{ startSec: 0, endSec: 1, text: "alpha beta gamma delta" }], // one 1 s phrase, four words
+			2,
+			2,
+		);
+
+		expect(lines).toHaveLength(2); // two words per line, each expected to take half of the span
+		expect(lines[0]).toMatchObject({
+			startSec: 0,
+			endSec: 0.5,
+			text: "alpha beta",
+		});
+		expect(lines[1]).toMatchObject({
+			startSec: 0.5,
+			endSec: 1,
+			text: "gamma delta",
+		});
+	});
+});
+
+describe("captionSegmentsToAnnotationRegions", () => {
+	it("uses raw phrase timing instead of shifting caption boundaries", () => {
+		const { regions } = captionSegmentsToAnnotationRegions(
+			[
+				{ startSec: 0, endSec: 0.5, text: "first second" },
+				{ startSec: 0.62, endSec: 1.2, text: "third fourth" },
+			],
+			1,
+			1,
+			{ minWordsPerCaption: 2, maxWordsPerCaption: 2, timestampGranularity: "phrase" },
+		);
+
+		expect(regions).toHaveLength(2);
+		expect(regions[0]).toMatchObject({ startMs: 0, endMs: 500 }); // seconds → ms, boundaries unchanged
+		expect(regions[1]).toMatchObject({ startMs: 620, endMs: 1200 });
+	});
+
+	// NOTE(review): this case calls groupTimedCaptionWordsIntoLines directly, not the function this suite is named after.
+	it("preserves empty timeline space when word timestamps contain a real pause", () => {
+		const lines = groupTimedCaptionWordsIntoLines(
+			[
+				{ startSec: 0, endSec: 0.12, text: "first" },
+				{ startSec: 0.13, endSec: 0.28, text: "caption" },
+				{ startSec: 0.7, endSec: 0.83, text: "second" }, // starts 0.42 s after the previous word ends: the pause under test
+				{ startSec: 0.84, endSec: 0.98, text: "caption" },
+			],
+			2,
+			2,
+		);
+		expect(lines).toHaveLength(2);
+		expect(lines[0]).toMatchObject({ startSec: 0, endSec: 0.28, text: "first caption" });
+		expect(lines[1]).toMatchObject({ startSec: 0.7, endSec: 0.98, text: "second caption" });
+	});
+
+	it("preserves repeated words before grouping in word mode", () => {
+		const { regions } = captionSegmentsToAnnotationRegions(
+			[
+				{ startSec: 0, endSec: 0.12, text: "I" },
+				{ startSec: 0.13, endSec: 0.25, text: "I" }, // same token again: must not be deduplicated away
+			],
+			1,
+			1,
+			{ minWordsPerCaption: 2, maxWordsPerCaption: 2, timestampGranularity: "word" },
+		);
+
+		expect(regions).toHaveLength(1);
+		expect(regions[0]).toMatchObject({ content: "I I" });
+	});
+});
+
+describe("reconcileAutoCaptionTimelineGaps", () => {
+	it("does not change regions when the minimum enforced gap is zero", () => {
+		const regions = reconcileAutoCaptionTimelineGaps([ // no minGapMs argument: relies on the default gap being 0
+			{
+				id: "annotation-1",
+				startMs: 0,
+				endMs: 120,
+				type: "text",
+				content: "one",
+				annotationSource: "auto-caption",
+				position: { x: 0, y: 0 },
+				size: { width: 10, height: 10 },
+				style: {
+					color: "#fff",
+					backgroundColor: "transparent",
+					fontSize: 24,
+					fontFamily: "Inter",
+					fontWeight: "normal",
+					fontStyle: "normal",
+					textDecoration: "none",
+					textAlign: "center",
+				},
+				zIndex: 1,
+			},
+			{
+				id: "manual-1", // no annotationSource: a manually created region overlapping both captions
+				startMs: 50,
+				endMs: 1000,
+				type: "text",
+				content: "manual",
+				position: { x: 10, y: 10 },
+				size: { width: 10, height: 10 },
+				style: {
+					color: "#fff",
+					backgroundColor: "transparent",
+					fontSize: 24,
+					fontFamily: "Inter",
+					fontWeight: "normal",
+					fontStyle: "normal",
+					textDecoration: "none",
+					textAlign: "center",
+				},
+				zIndex: 2,
+			},
+			{
+				id: "annotation-2",
+				startMs: 130, // only 10 ms after annotation-1 ends
+				endMs: 300,
+				type: "text",
+				content: "two",
+				annotationSource: "auto-caption",
+				position: { x: 0, y: 0 },
+				size: { width: 10, height: 10 },
+				style: {
+					color: "#fff",
+					backgroundColor: "transparent",
+					fontSize: 24,
+					fontFamily: "Inter",
+					fontWeight: "normal",
+					fontStyle: "normal",
+					textDecoration: "none",
+					textAlign: "center",
+				},
+				zIndex: 3,
+			},
+		]);
+
+		expect(regions.find((r) => r.id === "manual-1")).toMatchObject({
+			startMs: 50,
+			endMs: 1000,
+		});
+		expect(regions.find((r) => r.id === "annotation-1")).toMatchObject({
+			startMs: 0,
+			endMs: 120,
+		});
+		expect(regions.find((r) => r.id === "annotation-2")).toMatchObject({
+			startMs: 130,
+			endMs: 300,
+		});
+	});
+});
diff --git a/src/lib/captioning/annotationsFromCaptions.ts b/src/lib/captioning/annotationsFromCaptions.ts
new file mode 100644
index 000000000..0f6dc2af4
--- /dev/null
+++ b/src/lib/captioning/annotationsFromCaptions.ts
@@ -0,0 +1,618 @@
+import type { AnnotationRegion, AnnotationTextStyle } from "@/components/video-editor/types";
+
+import type { CaptionSegment } from "./transcribe";
+
+/** Wide lower-third bar; `position.x` is top-left as % of container, so center with (100 − width) / 2. */
+const CAPTION_WIDTH = 92; // % of container width
+const CAPTION_HEIGHT = 12; // % of container height
+const CAPTION_BOTTOM_MARGIN = 2; // % of container height left free below the bar
+
+const CAPTION_POSITION = {
+	x: (100 - CAPTION_WIDTH) / 2,
+	y: 100 - CAPTION_HEIGHT - CAPTION_BOTTOM_MARGIN,
+};
+
+const CAPTION_SIZE = { width: CAPTION_WIDTH, height: CAPTION_HEIGHT };
+
+const CAPTION_STYLE: AnnotationTextStyle = {
+	color: "#ffffff",
+	backgroundColor: "rgba(255, 255, 255, 0)",
+	fontSize: 24,
+	fontFamily: "Inter",
+	fontWeight: "normal",
+	fontStyle: "normal",
+	textDecoration: "none",
+	textAlign: "center",
+};
+
+/**
+ * Offset added to caption **starts** (seconds): negative shows lines earlier (Whisper onsets are
+ * often slightly late), 0 disables it. Do **not** offset ends too — that hides lines too early.
+ */
+const AUTO_CAPTION_START_BIAS_SEC = 0;
+
+/**
+ * Extra time held after Whisper’s segment **end** (seconds). Model end times are often early vs.
+ * trailing vowels / room tone; this is separate from `AUTO_CAPTION_START_BIAS_SEC`.
+ */
+const AUTO_CAPTION_END_HOLD_SEC = 0;
+
+/** Floor on a sub-line's span (seconds); lines split inside one Whisper phrase may be this short. */
+const WORD_SPLIT_MIN_SPAN_SEC = 0.02;
+
+/** Brief linger after the last word in a line (seconds); trimmed if it would overlap the next line. */
+const CAPTION_LINE_END_TAIL_SEC = 0;
+
+/** A silence of at least this many seconds between word timestamps starts a new caption run. */
+const WORD_RUN_BREAK_GAP_SEC = 0.24;
+
+/** Minimum gap (seconds) enforced between caption regions on the timeline; 0 = not enforced. */
+const MIN_CAPTION_TIMELINE_GAP_SEC = 0;
+
+/** Same text again with almost no gap or overlap — common Whisper / chunk artifact. */
+const DEDUPE_SAME_TEXT_MAX_GAP_SEC = 0.55;
+
+/**
+ * Window (seconds) after a line's end in which `collapseSameContentEchoes` folds a same-text repeat.
+ */
+export const SAME_CONTENT_ECHO_MAX_GAP_SEC = 1.15;
+
+function normalizeCaptionKey(text: string): string {
+	return text // comparison key: texts differing only in the ways normalized below map to the same string
+		.trim()
+		.replace(/\s+/g, " ") // collapse whitespace runs to one space
+		.replace(/[\u2018\u2019]/g, "'") // curly single quotes → straight
+		.replace(/[\u201C\u201D]/g, '"') // curly double quotes → straight
+		.toLowerCase()
+		.replace(/[.!?,;:]+$/g, ""); // drop trailing punctuation
+}
+
+/** Legacy echo-collapse helper kept for reference while phrase timing uses raw model spans. */
+export function collapseSameContentEchoes(segments: CaptionSegment[]): CaptionSegment[] {
+	const sorted = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	const out: CaptionSegment[] = [];
+	const lastIndexByKey = new Map<string, number>(); // normalized text → index in `out` of its latest line
+
+	for (const seg of sorted) {
+		const key = normalizeCaptionKey(seg.text);
+		const hit = lastIndexByKey.get(key);
+		if (hit !== undefined) {
+			const prev = out[hit]!; // latest line with the same text; other lines may sit in between
+			if (seg.startSec < prev.endSec + SAME_CONTENT_ECHO_MAX_GAP_SEC) {
+				prev.startSec = Math.min(prev.startSec, seg.startSec); // echo: widen that line instead of adding one
+				prev.endSec = Math.max(prev.endSec, seg.endSec);
+				continue;
+			}
+		}
+		out.push({
+			startSec: seg.startSec,
+			endSec: seg.endSec,
+			text: seg.text.trim(),
+		}); // fresh copy, so the widening above never mutates the caller's segments
+		lastIndexByKey.set(key, out.length - 1);
+	}
+	return out;
+}
+
+/**
+ * Collapse adjacent duplicate lines: when a segment's normalized text equals the previous kept
+ * line's and it overlaps that line or starts less than `DEDUPE_SAME_TEXT_MAX_GAP_SEC` after its
+ * end, the previous line is widened to cover both instead of emitting a second line.
+ *
+ * Does not merge the same phrase repeated later in the video when separated by real silence,
+ * nor repeats that have a different line in between (only the last kept line is compared).
+ */
+function dedupeAdjacentCaptionRepeats(segments: CaptionSegment[]): CaptionSegment[] {
+	const sorted = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	const out: CaptionSegment[] = [];
+	for (const seg of sorted) {
+		const t = seg.text.trim();
+		const prev = out[out.length - 1];
+		if (prev && normalizeCaptionKey(prev.text) === normalizeCaptionKey(t)) {
+			const overlap = prev.endSec - seg.startSec;
+			const gap = seg.startSec - prev.endSec; // negative when the two lines overlap (gap === -overlap)
+			if (overlap > 0.015 || gap < DEDUPE_SAME_TEXT_MAX_GAP_SEC) { // any overlap also satisfies the gap test
+				prev.startSec = Math.min(prev.startSec, seg.startSec);
+				prev.endSec = Math.max(prev.endSec, seg.endSec);
+				continue;
+			}
+		}
+		out.push({ startSec: seg.startSec, endSec: seg.endSec, text: t });
+	}
+	return out;
+}
+
+/** Trim only real overlaps. Avoid synthetic lead/lag so caption timing matches model output. */
+function finalizeCaptionSegmentsForPlayback(segments: CaptionSegment[]): CaptionSegment[] {
+	const OVERLAP_TRIM_SEC = 0.002; // overlaps of up to 2 ms are left untouched
+
+	const sortedRaw = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+
+	const a = sortedRaw.map((seg) => {
+		let s = seg.startSec + AUTO_CAPTION_START_BIAS_SEC;
+		let e = seg.endSec + AUTO_CAPTION_END_HOLD_SEC;
+		s = Math.max(0, s); // clamp: starts never go below 0
+		if (e <= s) e = s + 0.02; // guarantee a positive duration
+		return { startSec: s, endSec: e, text: seg.text.trim() };
+	});
+
+	for (let i = 1; i < a.length; i++) {
+		if (a[i].startSec < a[i - 1].endSec - OVERLAP_TRIM_SEC) {
+			a[i - 1].endSec = Math.max(a[i - 1].startSec + 1e-4, a[i].startSec); // cut the earlier line where the next one starts
+		}
+	}
+
+	return a;
+}
+
+/** Default min gap between auto-caption blocks on the timeline (ms); matches `MIN_CAPTION_TIMELINE_GAP_SEC`. */
+export const DEFAULT_AUTO_CAPTION_MIN_GAP_MS = Math.round(MIN_CAPTION_TIMELINE_GAP_SEC * 1000);
+
+/**
+ * Enforces `minGapMs` between consecutive `auto-caption` regions (by start time): trims the previous
+ * region's end while at least 1 ms of it remains, else shifts the next region later, keeping its duration.
+ * Other regions and array order are untouched; a gap of 0 or fewer than 2 captions returns the input as-is.
+ */
+export function reconcileAutoCaptionTimelineGaps(
+	regions: AnnotationRegion[],
+	minGapMs: number = DEFAULT_AUTO_CAPTION_MIN_GAP_MS,
+): AnnotationRegion[] {
+	const gap = Math.max(0, Math.round(minGapMs)); // whole, non-negative milliseconds
+	if (regions.length === 0 || gap === 0) return regions;
+
+	const autoCandidates = regions.filter((r) => r.annotationSource === "auto-caption");
+	if (autoCandidates.length <= 1) return regions;
+
+	const sorted = [...autoCandidates].sort((a, b) => a.startMs - b.startMs || a.endMs - b.endMs);
+	const fixed: AnnotationRegion[] = [];
+	let prev = { ...sorted[0]! }; // work on shallow copies; input region objects are not mutated
+	fixed.push(prev);
+
+	for (let i = 1; i < sorted.length; i++) {
+		let cur = { ...sorted[i]! };
+		const minStart = prev.endMs + gap;
+
+		if (cur.startMs < minStart) {
+			const newPrevEnd = cur.startMs - gap;
+			if (newPrevEnd >= prev.startMs + 1) { // prev keeps at least 1 ms: trim its end
+				prev = { ...prev, endMs: newPrevEnd };
+				fixed[fixed.length - 1] = prev;
+			} else { // prev is too short to trim: start cur later instead, keeping its duration
+				const dur = Math.max(1, cur.endMs - cur.startMs);
+				cur = { ...cur, startMs: minStart, endMs: minStart + dur };
+			}
+		}
+
+		fixed.push(cur);
+		prev = cur;
+	}
+
+	const fixedById = new Map(fixed.map((r) => [r.id, r])); // write adjusted captions back in the caller's order
+	return regions.map((r) => fixedById.get(r.id) ?? r);
+}
+
+/** Join phrases that are close in time so the editor does not create dozens of separate overlays. */
+export function mergeAdjacentCaptionSegments(
+	segments: CaptionSegment[],
+	options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+): CaptionSegment[] {
+	const maxGapSec = options?.maxGapSec ?? 1.35; // largest gap (seconds) bridged inside one block
+	const maxChars = options?.maxChars ?? 320; // cap on the merged text length, joining spaces included
+	const maxBlockDurationSec = options?.maxBlockDurationSec ?? 12; // cap on a merged block's span (seconds)
+
+	const sorted = [...segments].sort((a, b) => a.startSec - b.startSec);
+	const out: CaptionSegment[] = [];
+
+	for (const seg of sorted) {
+		const text = seg.text.trim();
+		if (!text) continue; // blank segments are dropped
+
+		const prev = out[out.length - 1];
+		if (!prev) {
+			out.push({ startSec: seg.startSec, endSec: seg.endSec, text });
+			continue;
+		}
+
+		const gap = seg.startSec - prev.endSec;
+		const mergedText = `${prev.text} ${text}`.trim();
+		const mergedEnd = Math.max(prev.endSec, seg.endSec);
+		const wouldSpan = mergedEnd - prev.startSec;
+		if (gap <= maxGapSec && mergedText.length <= maxChars && wouldSpan <= maxBlockDurationSec) {
+			prev.endSec = mergedEnd; // prev is an object created in this function, so the input is not mutated
+			prev.text = mergedText;
+		} else {
+			out.push({ startSec: seg.startSec, endSec: seg.endSec, text });
+		}
+	}
+
+	return out;
+}
+
+/**
+ * Splits phrase segments (sorted by start) into runs of neighbours that may share one caption block.
+ * A segment joins the current run only if its gap to the previous one is ≤ `maxGapSec` (default 0)
+ * and the run's space-joined text and time span stay within `maxChars` / `maxBlockDurationSec`.
+ * Blank-text segments are dropped; kept segments are copied with trimmed text.
+ */
+function partitionPhraseCaptionSegments(
+	segments: CaptionSegment[],
+	options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+): CaptionSegment[][] {
+	const maxGapSec = options?.maxGapSec ?? 0;
+	const maxChars = options?.maxChars ?? Number.POSITIVE_INFINITY;
+	const maxBlockDurationSec = options?.maxBlockDurationSec ?? Number.POSITIVE_INFINITY;
+
+	const sorted = [...segments]
+		.filter((s) => s.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	const groups: CaptionSegment[][] = [];
+	let current: CaptionSegment[] = [];
+	// Length of the current run joined by single spaces (assumed join — confirm against the consumer).
+	let currentChars = 0;
+
+	for (const seg of sorted) {
+		const text = seg.text.trim(); // never empty: blank segments were filtered out above
+		const prev = current[current.length - 1];
+		if (!prev) {
+			current = [{ ...seg, text }];
+			currentChars = text.length;
+			continue;
+		}
+
+		const gap = seg.startSec - prev.endSec;
+		const wouldChars = currentChars + 1 + text.length; // + 1 for the joining space
+		const wouldSpan = Math.max(prev.endSec, seg.endSec) - current[0]!.startSec;
+		if (gap <= maxGapSec && wouldChars <= maxChars && wouldSpan <= maxBlockDurationSec) {
+			current.push({ ...seg, text });
+			currentChars = wouldChars; // carries every earlier separator, unlike a plain sum of lengths
+			continue;
+		}
+
+		groups.push(current);
+		current = [{ ...seg, text }];
+		currentChars = text.length;
+	}
+	if (current.length > 0) groups.push(current);
+	return groups;
+}
+
+export interface CaptionSegmentLayoutOptions {
+	/** Lower bound on words per on-screen caption (default 2). */
+	minWordsPerCaption?: number;
+	/** Upper bound on words per on-screen caption (default 7). */
+	maxWordsPerCaption?: number;
+	/**
+	 * `word`: each `CaptionSegment` is a single token with Whisper word timestamps (default).
+	 * `phrase`: merged phrase spans; use proportional line splitting inside each span.
+	 */
+	timestampGranularity?: "word" | "phrase";
+}
+
+/**
+ * Splits `wordCount` words into consecutive `[from, to)` ranges of up to `maxWords` words. A tail
+ * shorter than `minWords` is avoided when more than one range exists, so the last range may exceed `maxWords`.
+ */
+function computeCaptionLineIndexRanges(
+	wordCount: number,
+	minWords: number,
+	maxWords: number,
+): Array<{ from: number; to: number }> {
+	const minW = Math.max(1, Math.min(Math.floor(minWords), Math.floor(maxWords)));
+	const maxW = Math.max(minW, Math.floor(maxWords));
+	const sliceRanges: Array<{ from: number; to: number }> = [];
+	let i = 0;
+	while (i < wordCount) {
+		const remaining = wordCount - i;
+		if (remaining <= maxW) {
+			if (sliceRanges.length > 0 && remaining < minW) {
+				sliceRanges[sliceRanges.length - 1]!.to = wordCount; // fold a too-short tail into the previous range
+			} else {
+				sliceRanges.push({ from: i, to: wordCount });
+			}
+			break;
+		}
+		let take = maxW;
+		const after = remaining - take; // always > 0 here because remaining > maxW
+		if (after > 0 && after < minW) {
+			take = remaining - minW; // leave exactly minW words for the final range (always < maxW here)
+			if (take < minW) {
+				sliceRanges.push({ from: i, to: wordCount }); // cannot split into two valid ranges: emit the rest as one
+				break;
+			}
+		}
+		sliceRanges.push({ from: i, to: i + take });
+		i += take;
+	}
+	return sliceRanges;
+}
+
+/**
+ * Builds on-screen caption lines from per-word segments, keeping each word's own timestamps
+ * (a line starts at its first word and ends shortly after its last word; nothing is stretched
+ * proportionally across a phrase).
+ *
+ * Words are sorted by time, split into runs wherever the pause between two neighbours reaches
+ * `WORD_RUN_BREAK_GAP_SEC`, and every run is cut into lines of `minWords`..`maxWords` words.
+ * A final pass pulls a line's end back when it would run into the start of the next line.
+ *
+ * @param segments Word-level segments; entries with blank text are ignored.
+ * @param minWords Desired minimum words per line.
+ * @param maxWords Desired maximum words per line.
+ * @returns Caption lines in chronological order.
+ */
+export function groupTimedCaptionWordsIntoLines(
+	segments: CaptionSegment[],
+	minWords: number,
+	maxWords: number,
+): CaptionSegment[] {
+	const timedWords = segments
+		.filter((seg) => seg.text.trim())
+		.sort((a, b) => a.startSec - b.startSec || a.endSec - b.endSec);
+	if (timedWords.length === 0) return [];
+
+	const flooredMax = Math.floor(maxWords);
+	const lower = Math.max(1, Math.min(Math.floor(minWords), flooredMax));
+	const upper = Math.max(lower, flooredMax);
+	const lines: CaptionSegment[] = [];
+
+	// Turns one pause-delimited run of words into caption lines.
+	const emitRun = (run: CaptionSegment[]): void => {
+		if (run.length === 0) return;
+		for (const range of computeCaptionLineIndexRanges(run.length, lower, upper)) {
+			const lineWords = run.slice(range.from, range.to);
+			const lineStart = lineWords[0]!.startSec;
+			const lastWordEnd = lineWords[lineWords.length - 1]!.endSec;
+			lines.push({
+				startSec: lineStart,
+				endSec: Math.max(
+					lineStart + WORD_SPLIT_MIN_SPAN_SEC,
+					lastWordEnd + CAPTION_LINE_END_TAIL_SEC,
+				),
+				text: lineWords.map((word) => word.text.trim()).join(" "),
+			});
+		}
+	};
+
+	let runBegin = 0;
+	for (let idx = 1; idx < timedWords.length; idx++) {
+		const pause = timedWords[idx]!.startSec - timedWords[idx - 1]!.endSec;
+		if (pause >= WORD_RUN_BREAK_GAP_SEC) {
+			emitRun(timedWords.slice(runBegin, idx));
+			runBegin = idx;
+		}
+	}
+	emitRun(timedWords.slice(runBegin, timedWords.length));
+
+	// The end-of-line tail can push a line past the next line's start; clamp it back while
+	// still guaranteeing the minimum visible span.
+	for (let next = 1; next < lines.length; next++) {
+		const line = lines[next - 1]!;
+		const following = lines[next]!;
+		if (line.endSec > following.startSec + 1e-3) {
+			line.endSec = Math.max(
+				line.startSec + WORD_SPLIT_MIN_SPAN_SEC,
+				following.startSec - 1e-4,
+			);
+		}
+	}
+
+	return lines;
+}
+
+/**
+ * Breaks merged transcription spans into shorter captions of roughly `minWords`–`maxWords`
+ * words each. Spans that already fit within `maxWords` are passed through with normalized
+ * whitespace; longer spans are split, with times interpolated by character weight.
+ *
+ * @param merged Phrase-level segments to split.
+ * @param minWords Desired minimum words per caption.
+ * @param maxWords Desired maximum words per caption.
+ * @returns The resulting captions, in input order; blank segments are dropped.
+ */
+export function splitMergedCaptionsByWordBounds(
+	merged: CaptionSegment[],
+	minWords: number,
+	maxWords: number,
+): CaptionSegment[] {
+	const flooredMax = Math.floor(maxWords);
+	const lower = Math.max(1, Math.min(Math.floor(minWords), flooredMax));
+	const upper = Math.max(lower, flooredMax);
+
+	return merged.flatMap((seg): CaptionSegment[] => {
+		const tokens = seg.text.trim().split(/\s+/).filter(Boolean);
+		if (tokens.length === 0) return [];
+		if (tokens.length > upper) {
+			return splitOneSegmentByWordBounds(seg.startSec, seg.endSec, tokens, lower, upper);
+		}
+		return [{ startSec: seg.startSec, endSec: seg.endSec, text: tokens.join(" ") }];
+	});
+}
+
+/**
+ * Re-flows `text` into newline-separated lines of `minWords`..`maxWords` words each.
+ * Returns an empty string when `text` contains no words.
+ */
+function wrapCaptionTextByWordBounds(text: string, minWords: number, maxWords: number): string {
+	const tokens = text.trim().split(/\s+/).filter(Boolean);
+	if (tokens.length === 0) return "";
+
+	const flooredMax = Math.floor(maxWords);
+	const lower = Math.max(1, Math.min(Math.floor(minWords), flooredMax));
+	const upper = Math.max(lower, flooredMax);
+
+	const rows: string[] = [];
+	for (const range of computeCaptionLineIndexRanges(tokens.length, lower, upper)) {
+		rows.push(tokens.slice(range.from, range.to).join(" "));
+	}
+	return rows.join("\n");
+}
+
+/**
+ * Expands a phrase segment into one segment per word. A single-word phrase keeps the
+ * phrase's own times; longer phrases get per-word times from `splitOneSegmentByWordBounds`
+ * (one word per slice). Blank phrases yield an empty array.
+ */
+function expandPhraseSegmentToPseudoWords(segment: CaptionSegment): CaptionSegment[] {
+	const tokens = segment.text.trim().split(/\s+/).filter(Boolean);
+	switch (tokens.length) {
+		case 0:
+			return [];
+		case 1:
+			return [{ startSec: segment.startSec, endSec: segment.endSec, text: tokens[0]! }];
+		default:
+			return splitOneSegmentByWordBounds(segment.startSec, segment.endSec, tokens, 1, 1);
+	}
+}
+
+/**
+ * Builds on-screen caption lines from phrase-level segments.
+ *
+ * Segments are first partitioned by `partitionPhraseCaptionSegments`. A partition holding a
+ * single phrase is wrapped into lines of `minWords`..`maxWords` words that share the phrase's
+ * duration equally; if the phrase is too short to give every line the minimum span, it stays
+ * a single caption. Partitions with several phrases are expanded into pseudo-words and laid
+ * out by `groupTimedCaptionWordsIntoLines`.
+ *
+ * @param segments Phrase-level segments.
+ * @param minWords Desired minimum words per line.
+ * @param maxWords Desired maximum words per line.
+ * @param options Forwarded unchanged to `partitionPhraseCaptionSegments`.
+ * @returns Caption lines in partition order.
+ */
+export function groupPhraseCaptionSegmentsIntoLines(
+	segments: CaptionSegment[],
+	minWords: number,
+	maxWords: number,
+	options?: { maxGapSec?: number; maxChars?: number; maxBlockDurationSec?: number },
+): CaptionSegment[] {
+	const lines: CaptionSegment[] = [];
+
+	for (const block of partitionPhraseCaptionSegments(segments, options)) {
+		if (block.length !== 1) {
+			const pseudoWords = block.flatMap(expandPhraseSegmentToPseudoWords);
+			lines.push(...groupTimedCaptionWordsIntoLines(pseudoWords, minWords, maxWords));
+			continue;
+		}
+
+		const phrase = block[0]!;
+		const wrapped = wrapCaptionTextByWordBounds(phrase.text, minWords, maxWords).trim();
+		if (!wrapped) continue;
+
+		const rows = wrapped
+			.split("\n")
+			.map((row) => row.trim())
+			.filter(Boolean);
+		const rowCount = rows.length;
+		const span = phrase.endSec - phrase.startSec;
+
+		// One row: the phrase is already a single caption.
+		if (rowCount <= 1) {
+			lines.push({
+				startSec: phrase.startSec,
+				endSec: phrase.endSec,
+				text: rows[0] ?? wrapped,
+			});
+			continue;
+		}
+
+		// Too short to show every row for the minimum span: keep it as one caption.
+		if (span < rowCount * WORD_SPLIT_MIN_SPAN_SEC) {
+			lines.push({
+				startSec: phrase.startSec,
+				endSec: phrase.endSec,
+				text: rows.join(" "),
+			});
+			continue;
+		}
+
+		// Give every row an equal share of the phrase; the last row ends with the phrase.
+		const total = Math.max(span, WORD_SPLIT_MIN_SPAN_SEC * rowCount);
+		rows.forEach((text, row) => {
+			const rowStart = phrase.startSec + (total * row) / rowCount;
+			const rowBoundary = phrase.startSec + (total * (row + 1)) / rowCount;
+			const rowEnd =
+				row === rowCount - 1
+					? phrase.endSec
+					: Math.max(rowStart + WORD_SPLIT_MIN_SPAN_SEC, rowBoundary);
+			lines.push({ startSec: rowStart, endSec: rowEnd, text });
+		});
+	}
+
+	return lines;
+}
+
+/**
+ * Splits one timed span into consecutive captions of `minWords`..`maxWords` words and
+ * interpolates each caption's times by character weight (a word weighs its length, at
+ * least 1) inside `[startSec, endSec]`.
+ *
+ * @param startSec Start of the source span, in seconds.
+ * @param endSec End of the source span, in seconds.
+ * @param words The span's words, already split on whitespace.
+ * @param minWords Minimum words per caption (passed to `computeCaptionLineIndexRanges`).
+ * @param maxWords Maximum words per caption (passed to `computeCaptionLineIndexRanges`).
+ * @returns Captions in order; the last one always ends at `endSec`.
+ */
+function splitOneSegmentByWordBounds(
+	startSec: number,
+	endSec: number,
+	words: string[],
+	minWords: number,
+	maxWords: number,
+): CaptionSegment[] {
+	const sliceRanges = computeCaptionLineIndexRanges(words.length, minWords, maxWords);
+
+	// Very short (or inverted) spans are treated as at least 50 ms long for interpolation.
+	const dur = Math.max(endSec - startSec, 0.05);
+
+	// prefixWeights[k] is the total weight of words[0..k). Built once so every range is O(1);
+	// the previous per-range re-summation was quadratic when each word is its own slice.
+	const prefixWeights: number[] = [0];
+	for (const word of words) {
+		prefixWeights.push(prefixWeights[prefixWeights.length - 1]! + Math.max(1, word.length));
+	}
+	const totalW = prefixWeights[words.length]!;
+	// Out-of-range indices clamp to the nearest end, like summing only the existing words.
+	const weightBefore = (index: number): number =>
+		prefixWeights[Math.min(Math.max(index, 0), words.length)] ?? 0;
+
+	const result: CaptionSegment[] = [];
+	let prevEnd = startSec;
+	for (const { from, to } of sliceRanges) {
+		const wb = weightBefore(from);
+		const ws = weightBefore(to) - wb;
+		let s = startSec + (wb / totalW) * dur;
+		let e = startSec + ((wb + ws) / totalW) * dur;
+		// Never start before the previous caption ended, keep the minimum visible span, and
+		// never run past the source span.
+		s = Math.max(s, prevEnd);
+		e = Math.max(s + WORD_SPLIT_MIN_SPAN_SEC, e);
+		e = Math.min(e, endSec);
+		if (e <= s) {
+			e = Math.min(endSec, s + WORD_SPLIT_MIN_SPAN_SEC);
+		}
+		prevEnd = e;
+		result.push({
+			startSec: s,
+			endSec: e,
+			text: words.slice(from, to).join(" "),
+		});
+	}
+
+	if (result.length > 0) {
+		// The final caption always closes exactly at the end of the source span.
+		result[result.length - 1]!.endSec = endSec;
+		// Remove overlaps larger than 2 ms between neighbouring captions.
+		for (let i = 0; i < result.length - 1; i++) {
+			const current = result[i]!;
+			const next = result[i + 1]!;
+			if (current.endSec > next.startSec + 0.002) {
+				current.endSec = Math.max(current.startSec + 1e-4, next.startSec);
+			}
+		}
+	}
+	return result;
+}
+
+/**
+ * Converts transcribed caption segments into `auto-caption` text annotation regions.
+ *
+ * Segments are grouped into lines (word or phrase layout, per `layout.timestampGranularity`),
+ * adjacent repeats are removed, the result is finalized for playback, and each line becomes
+ * a region with a sequential `annotation-<n>` id and z-index. The region list is passed
+ * through `reconcileAutoCaptionTimelineGaps` before being returned.
+ *
+ * @param segments Transcribed segments.
+ * @param startNumericId Numeric id given to the first created region.
+ * @param startZIndex z-index given to the first created region.
+ * @param layout Optional layout settings (defaults: 2–7 words, `word` granularity).
+ * @returns The regions plus the next unused numeric id and z-index.
+ */
+export function captionSegmentsToAnnotationRegions(
+	segments: CaptionSegment[],
+	startNumericId: number,
+	startZIndex: number,
+	layout?: CaptionSegmentLayoutOptions,
+): { regions: AnnotationRegion[]; nextNumericId: number; nextZIndex: number } {
+	// Raw word tokens are deliberately NOT echo-collapsed before grouping: repeated words
+	// ("I … I") share a normalized key and would merge spans while keeping only the first
+	// token's text.
+	const minW = layout?.minWordsPerCaption ?? 2;
+	const maxW = layout?.maxWordsPerCaption ?? 7;
+	const usePhraseLayout = (layout?.timestampGranularity ?? "word") === "phrase";
+
+	const lines = usePhraseLayout
+		? groupPhraseCaptionSegmentsIntoLines(segments, minW, maxW)
+		: groupTimedCaptionWordsIntoLines(segments, minW, maxW);
+	const playable = finalizeCaptionSegmentsForPlayback(dedupeAdjacentCaptionRepeats(lines));
+
+	const regions = playable.map((seg, offset): AnnotationRegion => {
+		const startMs = Math.round(seg.startSec * 1000);
+		return {
+			id: `annotation-${startNumericId + offset}`,
+			startMs,
+			// Every region lasts at least 1 ms after rounding.
+			endMs: Math.max(Math.round(seg.endSec * 1000), startMs + 1),
+			type: "text",
+			content: seg.text,
+			annotationSource: "auto-caption",
+			position: { ...CAPTION_POSITION },
+			size: { ...CAPTION_SIZE },
+			style: { ...CAPTION_STYLE },
+			zIndex: startZIndex + offset,
+		};
+	});
+
+	return {
+		regions: reconcileAutoCaptionTimelineGaps(regions),
+		nextNumericId: startNumericId + playable.length,
+		nextZIndex: startZIndex + playable.length,
+	};
+}
+
+/**
+ * Returns the largest `<n>` among region ids of the exact form `annotation-<n>`, or 0 when
+ * no id matches that pattern.
+ */
+export function maxAnnotationNumericId(regions: AnnotationRegion[]): number {
+	const idPattern = /^annotation-(\d+)$/;
+	return regions.reduce((highest, region) => {
+		const match = idPattern.exec(region.id);
+		return match ? Math.max(highest, Number.parseInt(match[1], 10)) : highest;
+	}, 0);
+}
+
+/**
+ * Returns the largest `zIndex` among `regions`, or 0 for an empty list.
+ *
+ * Folds with `Math.max` instead of spreading the array into a single call: spreading passes
+ * one argument per region and can throw a `RangeError` on very large arrays.
+ */
+export function maxAnnotationZIndex(regions: AnnotationRegion[]): number {
+	if (regions.length === 0) return 0;
+	return regions.reduce(
+		(highest, region) => Math.max(highest, region.zIndex),
+		Number.NEGATIVE_INFINITY,
+	);
+}
diff --git a/src/lib/captioning/captionConstants.ts b/src/lib/captioning/captionConstants.ts
new file mode 100644
index 000000000..1bacb7cc7
--- /dev/null
+++ b/src/lib/captioning/captionConstants.ts
@@ -0,0 +1,2 @@
+/**
+ * Max audio length for auto-captions (decode + transcribe), in seconds (4 hours); keep the
+ * demuxer read aligned with this.
+ */
+export const MAX_CAPTION_AUDIO_SEC = 4 * 60 * 60;
diff --git a/src/lib/captioning/extractMono16k.ts b/src/lib/captioning/extractMono16k.ts
new file mode 100644
index 000000000..53258567c
--- /dev/null
+++ b/src/lib/captioning/extractMono16k.ts
@@ -0,0 +1,159 @@
+import { MAX_CAPTION_AUDIO_SEC } from "./captionConstants";
+import { extractMonoPcmViaWebDemuxer } from "./extractMono16kWebDemuxer";
+
+export { MAX_CAPTION_AUDIO_SEC };
+
+const FETCH_TIMEOUT_MS = 120_000;
+
+/**
+ * `fetch` with a `FETCH_TIMEOUT_MS` deadline that can also be cancelled through `signal`.
+ *
+ * The timer and the abort listener are both removed as soon as `fetch` settles, so they
+ * cover the request up to the point the `Response` is returned, not the later reading of
+ * its body.
+ *
+ * @param url Resource to fetch.
+ * @param signal Optional caller signal; aborting it aborts the request.
+ * @returns The `Response` produced by `fetch`.
+ */
+async function fetchWithTimeout(url: string, signal?: AbortSignal): Promise<Response> {
+	const controller = new AbortController();
+	const abortRequest = () => controller.abort();
+	const timeoutId = window.setTimeout(abortRequest, FETCH_TIMEOUT_MS);
+
+	if (signal?.aborted) {
+		controller.abort();
+	} else {
+		signal?.addEventListener("abort", abortRequest, { once: true });
+	}
+
+	try {
+		return await fetch(url, { signal: controller.signal });
+	} finally {
+		window.clearTimeout(timeoutId);
+		signal?.removeEventListener("abort", abortRequest);
+	}
+}
+
+/**
+ * Load the editor video the same way as `StreamingVideoDecoder`:
+ * Electron `readBinaryFile` for local paths (fetch(file://) is unreliable in the renderer),
+ * otherwise HTTP / blob / data URLs via fetch.
+ *
+ * @param videoUrl Local path or `http(s):` / `blob:` / `data:` URL of the source video.
+ * @param signal Optional abort signal, checked after the bytes are loaded on both paths.
+ * @returns The video wrapped in a `File`.
+ * @throws Error when the file cannot be read or the HTTP response is not OK.
+ * @throws DOMException (`AbortError`) when `signal` was aborted while loading.
+ */
+async function loadSourceVideoFile(videoUrl: string, signal?: AbortSignal): Promise<File> {
+	const isRemoteUrl = /^(https?:|blob:|data:)/i.test(videoUrl);
+
+	if (!isRemoteUrl && window.electronAPI?.readBinaryFile) {
+		const result = await window.electronAPI.readBinaryFile(videoUrl);
+		// The read itself is not cancellable here, but an abort requested meanwhile must
+		// still stop the pipeline, the same as on the fetch path below.
+		if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+		if (!result.success || !result.data) {
+			throw new Error(result.message || result.error || "Failed to read source video");
+		}
+		const filename = (result.path || videoUrl).split(/[\\/]/).pop() || "video";
+		return new File([result.data], filename, { type: "video/webm" });
+	}
+
+	const response = await fetchWithTimeout(videoUrl, signal);
+	if (!response.ok) {
+		throw new Error(`Failed to load video for captions: ${response.status} ${response.statusText}`);
+	}
+	const blob = await response.blob();
+	if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	const filename = videoUrl.split("/").pop() || "video";
+	return new File([blob], filename, { type: blob.type || "video/webm" });
+}
+
+/**
+ * Averages all channels of `audioBuffer` into a single mono track.
+ *
+ * @param audioBuffer Decoded audio.
+ * @returns A new `Float32Array` of `audioBuffer.length` samples (all zeros when the buffer
+ *   reports no channels).
+ */
+function mixToMono(audioBuffer: AudioBuffer): Float32Array {
+	const { length, numberOfChannels } = audioBuffer;
+	const out = new Float32Array(length);
+	if (numberOfChannels === 0) return out;
+
+	// Fetch each channel once instead of calling getChannelData for every sample.
+	const channels: Float32Array[] = [];
+	for (let c = 0; c < numberOfChannels; c++) {
+		channels.push(audioBuffer.getChannelData(c));
+	}
+
+	for (let i = 0; i < length; i++) {
+		let sum = 0;
+		for (const channel of channels) {
+			sum += channel[i];
+		}
+		out[i] = sum / numberOfChannels;
+	}
+	return out;
+}
+
+/**
+ * Resamples mono PCM from `fromRate` to `toRate` by rendering it through an
+ * `OfflineAudioContext`.
+ *
+ * @param mono Input samples.
+ * @param fromRate Sample rate of `mono`, in Hz.
+ * @param toRate Target sample rate, in Hz.
+ * @param signal Optional abort signal, checked before and after rendering.
+ * @returns `mono` itself when the rates already match, otherwise a new array.
+ * @throws DOMException (`AbortError`) when `signal` is aborted.
+ */
+async function resampleMono(
+	mono: Float32Array,
+	fromRate: number,
+	toRate: number,
+	signal?: AbortSignal,
+): Promise<Float32Array> {
+	const throwIfAborted = (): void => {
+		if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+	};
+
+	throwIfAborted();
+	if (fromRate === toRate) return mono;
+
+	// At least one output frame, so the offline context length is never zero.
+	const targetLength = Math.max(1, Math.ceil((mono.length / fromRate) * toRate));
+	const context = new OfflineAudioContext(1, targetLength, toRate);
+
+	const input = context.createBuffer(1, mono.length, fromRate);
+	input.copyToChannel(Float32Array.from(mono), 0);
+
+	const player = context.createBufferSource();
+	player.buffer = input;
+	player.connect(context.destination);
+	player.start(0);
+
+	const rendered = await context.startRendering();
+	throwIfAborted();
+	return rendered.getChannelData(0).slice();
+}
+
+/**
+ * Caps the audio at `MAX_CAPTION_AUDIO_SEC` and resamples it to 16 kHz.
+ *
+ * @param mono Mono samples at `fromRate`.
+ * @param fromRate Sample rate of `mono`, in Hz.
+ * @param durationSec Duration of the source audio; decides whether truncation applies.
+ * @param signal Optional abort signal forwarded to the resampler.
+ * @returns The 16 kHz samples, whether the input was truncated, and the duration of the
+ *   returned samples.
+ */
+async function truncateAndResampleTo16k(
+	mono: Float32Array,
+	fromRate: number,
+	durationSec: number,
+	signal?: AbortSignal,
+): Promise<{ samples: Float32Array; truncated: boolean; durationSec: number }> {
+	const truncated = durationSec > MAX_CAPTION_AUDIO_SEC;
+	const input = truncated
+		? mono.subarray(0, Math.min(mono.length, Math.floor(MAX_CAPTION_AUDIO_SEC * fromRate)))
+		: mono;
+
+	const samples = await resampleMono(input, fromRate, 16_000, signal);
+	return { samples, truncated, durationSec: samples.length / 16_000 };
+}
+
+/**
+ * Decode the video's audio track to mono 16 kHz float samples (Whisper input).
+ * Prefers `decodeAudioData` when the container is supported; otherwise uses the same
+ * web-demuxer + AudioDecoder path as export.
+ *
+ * @param videoUrl Local path or URL of the source video.
+ * @param options Optional `signal` used to cancel loading and decoding.
+ * @returns The 16 kHz mono samples, whether they were truncated to the caption limit, and
+ *   their duration in seconds.
+ * @throws DOMException (`AbortError`) when `options.signal` is aborted.
+ */
+export async function extractMono16kFromVideoUrl(
+	videoUrl: string,
+	options?: { signal?: AbortSignal },
+): Promise<{ samples: Float32Array; truncated: boolean; durationSec: number }> {
+	const file = await loadSourceVideoFile(videoUrl, options?.signal);
+
+	/** When this returns null, use web-demuxer + AudioDecoder (same as export). */
+	const tryDecodeAudioDataPath = async (): Promise<{
+		samples: Float32Array;
+		truncated: boolean;
+		durationSec: number;
+	} | null> => {
+		const audioContext = new AudioContext();
+		try {
+			const ab = await file.arrayBuffer();
+			if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+			const audioBuffer = await audioContext.decodeAudioData(ab.slice(0));
+			if (
+				audioBuffer.numberOfChannels === 0 ||
+				audioBuffer.length === 0 ||
+				!Number.isFinite(audioBuffer.duration) ||
+				audioBuffer.duration <= 0
+			) {
+				return null;
+			}
+			const durationSec = audioBuffer.duration;
+			const mono = mixToMono(audioBuffer);
+			const fromRate = audioBuffer.sampleRate;
+			const out = await truncateAndResampleTo16k(mono, fromRate, durationSec, options?.signal);
+			// decodeAudioData can resolve for some WebM/Matroska inputs yet yield almost no
+			// usable PCM; treat that like a decode failure so the demuxer path gets a chance.
+			if (out.samples.length < 800) {
+				return null;
+			}
+			return out;
+		} catch {
+			// A cancellation must propagate: returning null here would start the demuxer
+			// fallback for a request the caller has already abandoned.
+			if (options?.signal?.aborted) throw new DOMException("Aborted", "AbortError");
+			// Any other failure means this container needs the demuxer fallback.
+			return null;
+		} finally {
+			await audioContext.close().catch(() => undefined);
+		}
+	};
+
+	const primary = await tryDecodeAudioDataPath();
+	if (primary) {
+		return primary;
+	}
+
+	const pcm = await extractMonoPcmViaWebDemuxer(file, options?.signal);
+	return truncateAndResampleTo16k(pcm.mono, pcm.sampleRate, pcm.durationSec, options?.signal);
+}
diff --git a/src/lib/captioning/extractMono16kWebDemuxer.ts b/src/lib/captioning/extractMono16kWebDemuxer.ts
new file mode 100644
index 000000000..fd85f5703
--- /dev/null
+++ b/src/lib/captioning/extractMono16kWebDemuxer.ts
@@ -0,0 +1,187 @@
+import { WebDemuxer } from "web-demuxer";
+
+import { MAX_CAPTION_AUDIO_SEC } from "./captionConstants";
+
+const DECODE_QUEUE_BACKPRESSURE = 20;
+const SOURCE_LOAD_TIMEOUT_MS = 60_000;
+const READ_END_PADDING_SEC = 0.5;
+
+/** Absolute URL of the web-demuxer wasm binary, resolved against the current page location. */
+function webDemuxerWasmUrl(): string {
+	const wasmLocation = new URL("../exporter/wasm/web-demuxer.wasm", window.location.href);
+	return wasmLocation.href;
+}
+
+/**
+ * Averages all channels of one decoded `AudioData` frame into mono float samples.
+ *
+ * Every channel is copied out with an explicit `f32-planar` conversion, so the result is
+ * correct whatever sample format the decoder produced (interleaved or planar, integer or
+ * float). Copying without a `format` hands back the frame's native layout, which only
+ * matches a `Float32Array` destination for f32 sources.
+ *
+ * @param frame Decoded audio frame; it is read but not closed here.
+ * @returns `frame.numberOfFrames` mono samples (all zeros when the frame has no channels).
+ */
+function audioDataFrameToMono(frame: AudioData): Float32Array {
+	const frames = frame.numberOfFrames;
+	const ch = frame.numberOfChannels;
+	const out = new Float32Array(frames);
+	// Nothing to average; also avoids dividing by zero below.
+	if (ch === 0) return out;
+
+	const plane = new Float32Array(frames);
+	for (let c = 0; c < ch; c++) {
+		// With a planar destination format, planeIndex selects the channel.
+		frame.copyTo(plane, { planeIndex: c, format: "f32-planar" });
+		for (let i = 0; i < frames; i++) {
+			out[i] += plane[i];
+		}
+	}
+	for (let i = 0; i < frames; i++) {
+		out[i] /= ch;
+	}
+	return out;
+}
+
+/**
+ * Lays decoded frames onto one mono timeline and closes each frame once it has been read.
+ *
+ * Each frame is placed at the sample position given by its timestamp (microseconds).
+ * Where frames overlap, the overlapping samples are averaged; positions no frame covers
+ * stay 0. Samples outside `[0, durationSec * sampleRate)` are dropped.
+ *
+ * @param frames Decoded frames (any order); every one is closed by this call.
+ * @param sampleRate Sample rate of the output timeline, in Hz.
+ * @param durationSec Length of the output timeline, in seconds.
+ * @returns The merged mono samples (at least one sample long).
+ */
+function mergeAndConsumeDecodedAudioToMonoLinear(
+	frames: AudioData[],
+	sampleRate: number,
+	durationSec: number,
+): Float32Array {
+	const ordered = frames.slice().sort((a, b) => a.timestamp - b.timestamp);
+	const sampleCount = Math.max(1, Math.ceil(durationSec * sampleRate));
+	const mixed = new Float32Array(sampleCount);
+	const contributions = new Float32Array(sampleCount);
+
+	for (const frame of ordered) {
+		const offset = Math.round((frame.timestamp / 1e6) * sampleRate);
+		const pcm = audioDataFrameToMono(frame);
+		pcm.forEach((value, index) => {
+			const at = offset + index;
+			if (at >= 0 && at < sampleCount) {
+				mixed[at] += value;
+				contributions[at] += 1;
+			}
+		});
+		frame.close();
+	}
+
+	// Turn sums into averages wherever more than zero frames contributed.
+	for (let at = 0; at < sampleCount; at++) {
+		if (contributions[at] > 0) {
+			mixed[at] /= contributions[at];
+		}
+	}
+	return mixed;
+}
+
+/**
+ * Settles with `promise`, unless `ms` milliseconds pass first, in which case it rejects with
+ * `new Error(message)`. Non-`Error` rejection reasons are wrapped in an `Error`. The
+ * underlying operation is not cancelled when the deadline fires.
+ */
+function withTimeout<T>(promise: Promise<T>, ms: number, message: string): Promise<T> {
+	return new Promise<T>((resolve, reject) => {
+		const timer = window.setTimeout(() => reject(new Error(message)), ms);
+		const stopTimer = () => window.clearTimeout(timer);
+
+		promise
+			.then((value) => {
+				stopTimer();
+				resolve(value);
+			})
+			.catch((reason) => {
+				stopTimer();
+				reject(reason instanceof Error ? reason : new Error(String(reason)));
+			});
+	});
+}
+
+/**
+ * Demux + WebCodecs audio decode (same stack as export). Use when
+ * `decodeAudioData` cannot handle the container (e.g. WebM with video).
+ *
+ * The decoder and every decoded frame are released on all exit paths (success, abort,
+ * error), and a decoder failure is reported as an explicit error instead of only being
+ * logged.
+ *
+ * @param file Source video.
+ * @param signal Optional abort signal, checked between demux/decode steps.
+ * @returns Mono samples at the track's sample rate, that rate, and the audio duration.
+ * @throws Error when parsing times out, no audio track exists, the codec is unsupported,
+ *   decoding fails, or no frames were decoded.
+ * @throws DOMException (`AbortError`) when `signal` is aborted.
+ */
+export async function extractMonoPcmViaWebDemuxer(
+	file: File,
+	signal?: AbortSignal,
+): Promise<{ mono: Float32Array; sampleRate: number; durationSec: number }> {
+	// NOTE(review): the demuxer instance is never released here; if WebDemuxer exposes a
+	// teardown method it should probably be called once decoding finishes — confirm.
+	const demuxer = new WebDemuxer({ wasmFilePath: webDemuxerWasmUrl() });
+	await withTimeout(
+		demuxer.load(file),
+		SOURCE_LOAD_TIMEOUT_MS,
+		"Timed out while parsing the source video for captions.",
+	);
+
+	if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
+
+	const mediaInfo = await withTimeout(
+		demuxer.getMediaInfo(),
+		SOURCE_LOAD_TIMEOUT_MS,
+		"Timed out while reading media info for captions.",
+	);
+
+	const reportedDurationSec =
+		Number.isFinite(mediaInfo.duration) && mediaInfo.duration > 0 ? mediaInfo.duration : 0;
+
+	let audioConfig: AudioDecoderConfig;
+	try {
+		audioConfig = await demuxer.getDecoderConfig("audio");
+	} catch {
+		throw new Error("No audio track found in this video.");
+	}
+
+	const codecCheck = await AudioDecoder.isConfigSupported(audioConfig);
+	if (!codecCheck.supported) {
+		throw new Error(`Audio codec not supported for captions: ${audioConfig.codec}`);
+	}
+
+	const sampleRate = audioConfig.sampleRate || 48_000;
+
+	// Many WebM/Matroska files report a too-short duration; capping read at reported time stops
+	// demux early and mergeAndConsumeDecodedAudioToMonoLinear clips everything past that. Read up to the
+	// same ceiling as caption decode (demuxer stops when the track ends).
+	const readEndSec = MAX_CAPTION_AUDIO_SEC + READ_END_PADDING_SEC;
+	const decodedFrames: AudioData[] = [];
+	// Filled by the decoder's error callback; checked so no more chunks are fed to a decoder
+	// that has already failed (it closes itself on error).
+	const decoderErrors: DOMException[] = [];
+
+	const decoder = new AudioDecoder({
+		output: (data: AudioData) => decodedFrames.push(data),
+		error: (e: DOMException) => {
+			decoderErrors.push(e);
+			console.error("[captioning] AudioDecoder error:", e);
+		},
+	});
+
+	try {
+		decoder.configure(audioConfig);
+
+		const reader = demuxer.read("audio", 0, readEndSec).getReader();
+		try {
+			while (!signal?.aborted && decoderErrors.length === 0) {
+				const { done, value: chunk } = await reader.read();
+				if (done || !chunk) break;
+				// The decoder may have failed while the read was pending.
+				if (decoder.state !== "configured") break;
+				decoder.decode(chunk);
+				// Backpressure: let the decoder drain before queueing more chunks.
+				while (
+					decoder.state === "configured" &&
+					decoder.decodeQueueSize > DECODE_QUEUE_BACKPRESSURE &&
+					!signal?.aborted
+				) {
+					await new Promise((r) => setTimeout(r, 1));
+				}
+			}
+		} finally {
+			try {
+				await reader.cancel();
+			} catch {
+				/* already closed */
+			}
+		}
+
+		// On abort, skip the flush: the pending output is discarded anyway.
+		if (signal?.aborted) {
+			throw new DOMException("Aborted", "AbortError");
+		}
+
+		if (decoderErrors.length === 0 && decoder.state === "configured") {
+			await decoder.flush();
+		}
+
+		const firstError = decoderErrors[0];
+		if (firstError) {
+			throw new Error(`Audio decoding failed for captions: ${firstError.message}`);
+		}
+
+		if (decodedFrames.length === 0) {
+			throw new Error("Decoded zero audio frames from this video.");
+		}
+
+		let maxEndUs = 0;
+		for (const f of decodedFrames) {
+			const end = f.timestamp + (f.duration ?? 0);
+			if (end > maxEndUs) maxEndUs = end;
+		}
+		const inferredDurationSec = maxEndUs / 1e6;
+		// Prefer extent implied by decoded frames (fixes bad container duration). If frames lack
+		// duration, fall back to reported metadata.
+		const durationSec = inferredDurationSec > 0.02 ? inferredDurationSec : reportedDurationSec;
+
+		const mono = mergeAndConsumeDecodedAudioToMonoLinear(decodedFrames, sampleRate, durationSec);
+		return { mono, sampleRate, durationSec };
+	} finally {
+		// Release the decoder first so it cannot emit more frames, then release every frame.
+		// Frames already closed by the merge step are closed again here, which is harmless.
+		if (decoder.state !== "closed") decoder.close();
+		for (const f of decodedFrames) f.close();
+	}
+}
diff --git a/src/lib/captioning/index.ts b/src/lib/captioning/index.ts
new file mode 100644
index 000000000..cc2e2a3a6
--- /dev/null
+++ b/src/lib/captioning/index.ts
@@ -0,0 +1,17 @@
+export type { CaptionSegmentLayoutOptions } from "./annotationsFromCaptions";
+export {
+	captionSegmentsToAnnotationRegions,
+	DEFAULT_AUTO_CAPTION_MIN_GAP_MS,
+	groupTimedCaptionWordsIntoLines,
+	mergeAdjacentCaptionSegments,
+	reconcileAutoCaptionTimelineGaps,
+	splitMergedCaptionsByWordBounds,
+} from "./annotationsFromCaptions";
+export { extractMono16kFromVideoUrl, MAX_CAPTION_AUDIO_SEC } from "./extractMono16k";
+export { shiftTrimRegionsMsForCaptionBuffer, trimLeadingSilenceMono16k } from "./leadingSilence";
+export type {
+	CaptionSegment,
+	CaptionTimestampGranularity,
+	TranscribeMono16kResult,
+} from "./transcribe";
+export { transcribeMono16kToSegments } from "./transcribe";
diff --git a/src/lib/captioning/leadingSilence.ts b/src/lib/captioning/leadingSilence.ts
new file mode 100644
index 000000000..4bd6a11aa
--- /dev/null
+++ b/src/lib/captioning/leadingSilence.ts
@@ -0,0 +1,78 @@
+/** Caption path is always mono 16 kHz after `extractMono16kFromVideoUrl`. */
+import type { TrimRegion } from "@/components/video-editor/types";
+
+const SAMPLE_RATE = 16_000;
+
+/** Window length for peak detection (~50 ms). */
+const WINDOW_SAMPLES = 800;
+
+/** Coarse hop so long intros scan quickly (~50 ms steps). */
+const HOP_SAMPLES = 800;
+
+/** Max |sample| in a window below this counts as silence (float PCM ~[-1, 1]). */
+const PEAK_THRESHOLD = 0.012;
+
+/** Keep a little audio before the first peak so word onsets are not clipped. */
+const PRE_ROLL_SEC = 0.12;
+
+/** Do not scan more than this much audio for leading silence (performance + pathological files). */
+const MAX_LEADING_SCAN_SEC = 15 * 60;
+
+/**
+ * Removes the quiet lead-in from 16 kHz mono audio so Whisper is not handed a long silent
+ * prefix (which can skew the first phrase and wastes work).
+ *
+ * The audio is scanned window by window for the first window whose peak exceeds
+ * `PEAK_THRESHOLD`; the cut is placed `PRE_ROLL_SEC` before it. Nothing is trimmed when the
+ * input is shorter than one window, when the very first window already contains sound, or
+ * when no loud window is found within the scanned range.
+ *
+ * @param samples Mono 16 kHz samples.
+ * @returns The (possibly shortened) view of `samples` and `trimSec`, the number of seconds
+ *   removed; callers must add `trimSec` back to every segment time.
+ */
+export function trimLeadingSilenceMono16k(samples: Float32Array): {
+	samples: Float32Array;
+	trimSec: number;
+} {
+	if (samples.length < WINDOW_SAMPLES) {
+		return { samples, trimSec: 0 };
+	}
+
+	const lastWindowStart = Math.min(
+		samples.length - WINDOW_SAMPLES,
+		Math.floor(MAX_LEADING_SCAN_SEC * SAMPLE_RATE),
+	);
+
+	let speechAt = -1;
+	for (
+		let windowStart = 0;
+		windowStart <= lastWindowStart && speechAt < 0;
+		windowStart += HOP_SAMPLES
+	) {
+		const windowEnd = windowStart + WINDOW_SAMPLES;
+		let loudest = 0;
+		for (let k = windowStart; k < windowEnd; k++) {
+			loudest = Math.max(loudest, Math.abs(samples[k]!));
+		}
+		if (loudest > PEAK_THRESHOLD) {
+			speechAt = windowStart;
+		}
+	}
+
+	// -1: nothing loud found; 0: sound starts immediately. Either way there is nothing to cut.
+	if (speechAt <= 0) {
+		return { samples, trimSec: 0 };
+	}
+
+	const keepFrom = Math.max(0, speechAt - Math.round(PRE_ROLL_SEC * SAMPLE_RATE));
+	return {
+		samples: samples.subarray(keepFrom),
+		trimSec: keepFrom / SAMPLE_RATE,
+	};
+}
+
+/**
+ * Moves trim regions into the coordinates of an audio buffer whose first `trimMs`
+ * milliseconds were removed, so `segmentOverlapsTrim` keeps comparing like with like
+ * (Whisper times are relative to the shortened buffer).
+ *
+ * @param regions Trim regions in original timeline milliseconds.
+ * @param trimMs Milliseconds removed from the front of the audio.
+ * @returns `regions` itself when `trimMs <= 0`; otherwise shifted copies clamped at 0, with
+ *   regions that end up empty removed.
+ */
+export function shiftTrimRegionsMsForCaptionBuffer(
+	regions: TrimRegion[],
+	trimMs: number,
+): TrimRegion[] {
+	if (trimMs <= 0) return regions;
+
+	const shifted: TrimRegion[] = [];
+	for (const region of regions) {
+		const startMs = Math.max(0, region.startMs - trimMs);
+		const endMs = Math.max(0, region.endMs - trimMs);
+		if (endMs > startMs) {
+			shifted.push({ ...region, startMs, endMs });
+		}
+	}
+	return shifted;
+}
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
new file mode 100644
index 000000000..91f1d91f0
--- /dev/null
+++ b/src/lib/captioning/transcribe.ts
@@ -0,0 +1,91 @@
+import type { TrimRegion } from "@/components/video-editor/types";
+
+/** One piece of transcribed text together with its start and end time, in seconds. */
+export interface CaptionSegment {
+	startSec: number;
+	endSec: number;
+	text: string;
+}
+
+/** How caption layout should interpret `CaptionSegment` times from `transcribeMono16kToSegments`. */
+export type CaptionTimestampGranularity = "word" | "phrase";
+
+export interface TranscribeMono16kResult {
+	segments: CaptionSegment[];
+	granularity: CaptionTimestampGranularity;
+}
+
+/** Request payload posted from the renderer to the transcription worker. */
+export interface TranscribeWorkerRequest {
+	samples: Float32Array;
+	trimRegions: TrimRegion[];
+}
+
+/** Messages the transcription worker posts back to the renderer. */
+export type TranscribeWorkerResponse =
+	| { type: "status"; phase: "model" | "transcribe" }
+	| { type: "result"; segments: CaptionSegment[]; granularity: CaptionTimestampGranularity }
+	| { type: "error"; message: string };
+
+/**
+ * Transcribes mono 16 kHz audio into timed caption segments with in-browser Whisper.
+ *
+ * Model loading and inference happen in a dedicated Web Worker so the editor's main thread
+ * stays responsive (WASM inference does not yield). The first run downloads model weights.
+ * The worker is terminated as soon as the call settles; aborting through `options.signal`
+ * terminates it immediately, because model load / inference cannot be cancelled
+ * cooperatively.
+ *
+ * @param samples Mono 16 kHz samples.
+ * @param options Optional trim regions forwarded to the worker, a status callback, and an
+ *   abort signal.
+ * @returns The segments and the timestamp granularity reported by the worker.
+ */
+export function transcribeMono16kToSegments(
+	samples: Float32Array,
+	options?: {
+		trimRegions?: TrimRegion[];
+		onStatus?: (phase: "model" | "transcribe") => void;
+		signal?: AbortSignal;
+	},
+): Promise<TranscribeMono16kResult> {
+	const signal = options?.signal;
+	if (signal?.aborted) {
+		return Promise.reject(new DOMException("Aborted", "AbortError"));
+	}
+
+	return new Promise<TranscribeMono16kResult>((resolve, reject) => {
+		const worker = new Worker(new URL("./transcribe.worker.ts", import.meta.url), {
+			type: "module",
+		});
+
+		// First outcome wins: detach the abort listener, stop the worker, then settle.
+		let done = false;
+		function settle(action: () => void): void {
+			if (done) return;
+			done = true;
+			signal?.removeEventListener("abort", handleAbort);
+			worker.terminate();
+			action();
+		}
+
+		function handleAbort(): void {
+			settle(() => reject(new DOMException("Aborted", "AbortError")));
+		}
+		signal?.addEventListener("abort", handleAbort, { once: true });
+
+		worker.onmessage = (event: MessageEvent<TranscribeWorkerResponse>) => {
+			const reply = event.data;
+			switch (reply.type) {
+				case "status":
+					options?.onStatus?.(reply.phase);
+					return;
+				case "result":
+					settle(() => resolve({ segments: reply.segments, granularity: reply.granularity }));
+					return;
+				default:
+					settle(() => reject(new Error(reply.message)));
+			}
+		};
+
+		worker.onerror = (event) => {
+			settle(() => reject(new Error(event.message || "Caption transcription worker failed")));
+		};
+
+		// Structured-clone copy (not a transfer): the caller may reuse `samples`
+		// for the full-buffer retry pass, so the buffer must stay valid here.
+		const request: TranscribeWorkerRequest = {
+			samples,
+			trimRegions: options?.trimRegions ?? [],
+		};
+		worker.postMessage(request);
+	});
+}
diff --git a/src/lib/captioning/transcribe.worker.ts b/src/lib/captioning/transcribe.worker.ts
new file mode 100644
index 000000000..edd16e8ec
--- /dev/null
+++ b/src/lib/captioning/transcribe.worker.ts
@@ -0,0 +1,81 @@
+/**
+ * Web Worker: runs in-browser Whisper transcription off the renderer's main
+ * thread so the editor UI never blocks while the model loads or audio is
+ * transcribed.
+ *
+ * Input message:  { samples: Float32Array; trimRegions: TrimRegion[] }
+ * Output messages (see `TranscribeWorkerResponse`):
+ *   { type: "status", phase: "model" | "transcribe" }  progress updates
+ *   { type: "result", segments, granularity }          final captions
+ *   { type: "error", message }                          failure detail
+ *
+ * The caller terminates this worker to abort (model load / inference cannot be
+ * cooperatively cancelled), so there is no in-worker abort handling.
+ */
+
+import type { TranscribeWorkerRequest, TranscribeWorkerResponse } from "./transcribe";
+import { runTranscription, type TranscriberFn } from "./transcribeCore";
+
+/** Sends a typed response message from this worker back to the thread that created it. */
+function post(message: TranscribeWorkerResponse): void {
+	const workerScope = self as unknown as Worker;
+	workerScope.postMessage(message);
+}
+
+/**
+ * ONNX Runtime's wasm bundle treats `process.versions.node` (which can leak into
+ * an Electron worker) as Node and tries `require("fs")`, which Vite does not
+ * support. Mask it only while Transformers / ORT run. No-op when `process` is
+ * undefined (the usual case in a Web Worker).
+ *
+ * Masking is best-effort: `Reflect.deleteProperty` reports failure by returning `false`
+ * rather than throwing, so its result is checked before falling back to overwriting the
+ * value. If the property cannot be masked at all, `fn` still runs. The saved value is only
+ * written back when masking actually changed something, and a failed restore never
+ * replaces the result of `fn`.
+ *
+ * @param fn Async work to run while the Node version is hidden.
+ * @returns Whatever `fn` resolves or rejects with.
+ */
+function withoutNodeVersion<T>(fn: () => Promise<T>): Promise<T> {
+	const versions =
+		typeof process !== "undefined" && process.versions && typeof process.versions === "object"
+			? (process.versions as { node?: string })
+			: null;
+	if (versions === null || !("node" in versions)) {
+		return fn();
+	}
+
+	const savedNode = versions.node;
+	let masked = false;
+	try {
+		masked = Reflect.deleteProperty(versions, "node");
+		if (!masked) {
+			// Not deletable: try to blank the value instead.
+			versions.node = undefined;
+			masked = versions.node === undefined;
+		}
+	} catch {
+		// Read-only property: continue unmasked rather than failing the whole transcription.
+		masked = false;
+	}
+
+	return fn().finally(() => {
+		if (!masked || savedNode === undefined) return;
+		try {
+			versions.node = savedNode;
+		} catch {
+			/* restoring is best-effort; keep fn's outcome */
+		}
+	});
+}
+
+/**
+ * Lazily imports Transformers.js and builds the `Xenova/whisper-tiny` speech-recognition
+ * pipeline, with the Node version masked for the duration (see `withoutNodeVersion`).
+ * Local model loading is disabled.
+ */
+async function loadTranscriber(): Promise<TranscriberFn> {
+	const buildPipeline = async (): Promise<TranscriberFn> => {
+		const transformers = await import("@xenova/transformers");
+		transformers.env.allowLocalModels = false;
+		// Default tiny weights only: the `output_attentions` revision has regressed inference for
+		// some environments (empty chunks / thrown errors) while phrase mode works on this model.
+		const asr = await transformers.pipeline(
+			"automatic-speech-recognition",
+			"Xenova/whisper-tiny",
+		);
+		return asr as unknown as TranscriberFn;
+	};
+
+	return withoutNodeVersion(buildPipeline);
+}
+
+// Handles a transcription request: reports the "model" phase, loads the transcriber,
+// reports the "transcribe" phase, runs transcription, then posts either the result or an
+// error message back to the caller.
+self.onmessage = async (event: MessageEvent<TranscribeWorkerRequest>) => {
+	const request = event.data;
+	const samples = request.samples;
+	const trimRegions = request.trimRegions;
+
+	try {
+		post({ type: "status", phase: "model" });
+		const transcriber = await loadTranscriber();
+
+		post({ type: "status", phase: "transcribe" });
+		const outcome = await runTranscription(transcriber, samples, trimRegions ?? []);
+
+		post({ type: "result", segments: outcome.segments, granularity: outcome.granularity });
+	} catch (err) {
+		const message = err instanceof Error ? err.message : String(err);
+		post({ type: "error", message });
+	}
+};
diff --git a/src/lib/captioning/transcribeCore.ts b/src/lib/captioning/transcribeCore.ts
new file mode 100644
index 000000000..111995246
--- /dev/null
+++ b/src/lib/captioning/transcribeCore.ts
@@ -0,0 +1,269 @@
+import type { TrimRegion } from "@/components/video-editor/types";
+import type { CaptionSegment, TranscribeMono16kResult } from "./transcribe";
+
+/**
+ * Pure transcription algorithm shared by the captioning Web Worker. It takes an
+ * already-constructed Whisper `transcriber` and turns mono 16 kHz audio into
+ * timed caption segments. Kept free of DOM / Transformers.js imports so it can
+ * run inside a worker and be unit-tested in isolation.
+ */
+
+/** A Transformers.js automatic-speech-recognition pipeline call: audio and options in, an untyped result out. */
+export type TranscriberFn = (
+	audio: Float32Array,
+	opts: Record<string, unknown>,
+) => Promise<unknown>;
+
+function segmentOverlapsTrim(startMs: number, endMs: number, trims: TrimRegion[]): boolean {
+	return trims.findIndex((r) => r.endMs > startMs && r.startMs < endMs) !== -1; // strict bounds
+}
+
+/** Removes segments that overlap any trim region; for retry passes that transcribed with no trims. */
+function dropSegmentsOverlappingTrimRegions(
+	segments: CaptionSegment[],
+	trimRegions: TrimRegion[],
+): CaptionSegment[] {
+	// Nothing to filter against: hand back the same array instance.
+	if (trimRegions.length === 0) return segments;
+	const toMs = (sec: number): number => Math.round(sec * 1000);
+	return segments.filter(
+		(segment) => !segmentOverlapsTrim(toMs(segment.startSec), toMs(segment.endSec), trimRegions),
+	);
+}
+
+/** Whisper runs with internal 30s chunks; slices are capped at 12 min of 16 kHz samples to bound each forward pass for WASM memory. */
+const TRANSCRIBE_SLICE_SAMPLES = 12 * 60 * 16_000;
+
+/** Minimum slice length (800 samples = 50 ms at 16 kHz); shorter slices are zero-padded by `padTailSliceForTranscribe`. */
+const MIN_TRANSCRIBE_SLICE_SAMPLES = 800;
+
+/**
+ * Makes a slice long enough for Whisper by zero-filling it up to the minimum slice length.
+ * `realDurationSec` always describes the unpadded input so callers can clamp timestamps to it.
+ */
+function padTailSliceForTranscribe(samples: Float32Array): {
+	slice: Float32Array;
+	realDurationSec: number;
+} {
+	let slice = samples;
+	if (slice.length < MIN_TRANSCRIBE_SLICE_SAMPLES) {
+		// Fresh zero-initialised buffer; the real samples occupy its head.
+		slice = new Float32Array(MIN_TRANSCRIBE_SLICE_SAMPLES);
+		slice.set(samples);
+	}
+	return { slice, realDurationSec: samples.length / 16_000 };
+}
+
+/**
+ * Converts raw Whisper chunk output into sorted, deduped, trim-filtered caption segments.
+ *
+ * @param chunks Raw chunks; those without a `timestamp` or with blank text are dropped.
+ * @param timeOffsetSec Offset of this slice on the full timeline, added to every timestamp.
+ * @param trims Trim regions in milliseconds; segments overlapping any of them are dropped.
+ * @param audioDurationSec Real slice duration; segment starts and ends are clamped to it.
+ * @returns Segments in start order, each satisfying `startSec <= endSec`.
+ */
+function segmentsFromTranscriberChunks(
+	chunks: Array<{ timestamp?: [number | null, number | null]; text?: unknown }>,
+	timeOffsetSec: number,
+	trims: TrimRegion[],
+	audioDurationSec: number,
+): CaptionSegment[] {
+	// Order by start time; a chunk without a numeric start gets key -1 and so sorts first.
+	const startKey = (chunk: { timestamp?: [number | null, number | null] }): number => {
+		const start = chunk.timestamp?.[0];
+		return typeof start === "number" ? start : -1;
+	};
+	const sorted = [...chunks].sort((x, y) => startKey(x) - startKey(y));
+	const sliceEnd = timeOffsetSec + audioDurationSec;
+	const segments: CaptionSegment[] = [];
+	for (const [idx, chunk] of sorted.entries()) {
+		const ts = chunk.timestamp;
+		if (!ts) continue;
+		// Clamp the start into the real audio: a start beyond `audioDurationSec` would otherwise
+		// yield a segment whose (clamped) end precedes its start.
+		const a = Math.min(Math.max(0, ts[0] ?? 0), audioDurationSec);
+		let b = ts[1];
+		if (b == null) {
+			// Open-ended chunk: close it at the next numeric start, else at the end of the slice.
+			b = audioDurationSec;
+			for (let j = idx + 1; j < sorted.length; j++) {
+				const nextStart = sorted[j]?.timestamp?.[0];
+				if (typeof nextStart === "number") {
+					b = nextStart;
+					break;
+				}
+			}
+		}
+		if (b <= a) b = a + 0.25; // empty or inverted span: fall back to a nominal 250 ms
+		b = Math.min(b, audioDurationSec);
+
+		const text = String(chunk.text ?? "").replace(/\s+/g, " ").trim();
+		if (!text) continue;
+		const startSec = a + timeOffsetSec;
+		// Aim for at least 80 ms per caption, but never run past the real end of the slice.
+		const endSec = Math.min(Math.max(startSec + 0.08, b + timeOffsetSec), sliceEnd);
+		if (segmentOverlapsTrim(Math.round(startSec * 1000), Math.round(endSec * 1000), trims)) {
+			continue;
+		}
+		segments.push({ startSec, endSec, text });
+	}
+
+	// Merge a segment into its predecessor when the text repeats and the spans touch or overlap.
+	segments.sort((u, v) => u.startSec - v.startSec || u.endSec - v.endSec);
+	const deduped: CaptionSegment[] = [];
+	for (const seg of segments) {
+		const prev = deduped[deduped.length - 1];
+		if (prev && prev.text === seg.text && seg.startSec <= prev.endSec) {
+			prev.endSec = Math.max(prev.endSec, seg.endSec);
+			prev.startSec = Math.min(prev.startSec, seg.startSec);
+			continue;
+		}
+		deduped.push(seg);
+	}
+	return deduped;
+}
+
+/** Invokes the transcriber on one slice; chunked decoding is requested only above 30 s. */
+async function runTranscriberOnSlice(
+	transcriber: TranscriberFn,
+	samples: Float32Array,
+	opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase" },
+): Promise<unknown> {
+	const callOptions: Record<string, unknown> = {
+		return_timestamps: opts.timestampMode === "word" ? "word" : true,
+		force_full_sequences: opts.forceFullSequences,
+	};
+	// Short clips go through unchunked: chunking them regressed some Whisper.js runs (empty chunks).
+	const needsChunking = samples.length / 16_000 > 30;
+	if (needsChunking) Object.assign(callOptions, { chunk_length_s: 30, stride_length_s: 5 });
+	return transcriber(samples, callOptions);
+}
+
+/**
+ * Normalises a Transformers.js ASR result (a single object or an array of them) into one
+ * flat chunk list. Anything without an array-valued `chunks` property contributes nothing.
+ */
+function getChunksFromTranscriberResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	type Chunk = { timestamp?: [number | null, number | null]; text?: unknown };
+	// Array-valued `chunks` of one result item; anything else (including null) yields [].
+	const chunksOf = (value: unknown): Chunk[] => {
+		const candidate = (value as { chunks?: unknown } | null | undefined)?.chunks;
+		return Array.isArray(candidate) ? candidate : [];
+	};
+	// An array result is flattened item by item, in order.
+	return Array.isArray(result) ? result.flatMap(chunksOf) : chunksOf(result);
+}
+
+/**
+ * Returns the result's chunks when it has any. Otherwise, if the (first) result carries a
+ * non-blank top-level `text`, wraps it in one open-ended chunk starting at 0 so it gets timing.
+ */
+function extractChunksFromAsrResult(result: unknown): Array<{
+	timestamp?: [number | null, number | null];
+	text?: unknown;
+}> {
+	const chunks = getChunksFromTranscriberResult(result);
+	if (chunks.length > 0) return chunks;
+
+	const first: unknown = Array.isArray(result) ? result[0] : result;
+	const rawText = (first as { text?: unknown } | null | undefined)?.text;
+	const text = typeof rawText === "string" ? rawText.trim() : "";
+	// The `null` end leaves it to the caller to decide where the span closes.
+	return text ? [{ timestamp: [0, null], text }] : [];
+}
+
+/**
+ * Drives Whisper over mono 16 kHz audio and returns timed caption segments.
+ *
+ * Audio is processed in slices of at most `TRANSCRIBE_SLICE_SAMPLES` so one forward pass does
+ * not exhaust WASM memory; each slice's timestamps are shifted back onto the full timeline.
+ * Word-level timestamps are tried first, then phrase-level. Within a mode the passes run with
+ * trims applied (with, then without, `force_full_sequences`) and, only when trims exist, again
+ * with trims ignored during transcription and applied afterwards. The first pass that yields
+ * any segment wins.
+ *
+ * @param transcriber Already-constructed ASR pipeline call.
+ * @param samples Mono 16 kHz samples; an empty array returns an empty result without running the model.
+ * @param trims Trim regions; segments overlapping them are excluded from the result.
+ * @returns The segments and the timestamp mode that produced them, or an empty list with
+ *   granularity "phrase" when every pass fails or finds nothing.
+ */
+export async function runTranscription(
+	transcriber: TranscriberFn,
+	samples: Float32Array,
+	trims: TrimRegion[],
+): Promise<TranscribeMono16kResult> {
+	// No audio means nothing to caption: skip the model instead of transcribing padded silence.
+	if (samples.length === 0) {
+		return { segments: [], granularity: "phrase" };
+	}
+
+	// One full pass over every slice. A failure anywhere is logged and reported as no segments.
+	const transcribeOne = async (
+		ignoreTrims: boolean,
+		forceFullSequences: boolean,
+		timestampMode: "word" | "phrase",
+	): Promise<CaptionSegment[]> => {
+		const activeTrims = ignoreTrims ? [] : trims;
+		const all: CaptionSegment[] = [];
+		try {
+			// A clip that fits in one slice is just the single-iteration case of this loop.
+			for (let offset = 0; offset < samples.length; offset += TRANSCRIBE_SLICE_SAMPLES) {
+				const end = Math.min(offset + TRANSCRIBE_SLICE_SAMPLES, samples.length);
+				// Only the final slice can be short; padding leaves long-enough slices untouched.
+				const { slice, realDurationSec } = padTailSliceForTranscribe(
+					samples.subarray(offset, end),
+				);
+
+				const result = await runTranscriberOnSlice(transcriber, slice, {
+					forceFullSequences,
+					timestampMode,
+				});
+				all.push(
+					...segmentsFromTranscriberChunks(
+						extractChunksFromAsrResult(result),
+						offset / 16_000, // slice start on the full timeline, in seconds
+						activeTrims,
+						realDurationSec,
+					),
+				);
+			}
+			return all;
+		} catch (e) {
+			// Best effort: the caller moves on to the next pass instead of aborting.
+			console.warn("[captioning] Whisper pass failed:", e);
+			return [];
+		}
+	};
+
+	// Retry ladder for one timestamp mode, in order. The trim-ignoring rungs are added only when
+	// trims exist; their output is filtered afterwards so trimmed-out speech is still excluded.
+	const passes: Array<{ ignoreTrims: boolean; forceFullSequences: boolean }> = [
+		{ ignoreTrims: false, forceFullSequences: true },
+		{ ignoreTrims: false, forceFullSequences: false },
+	];
+	if (trims.length > 0) {
+		passes.push(
+			{ ignoreTrims: true, forceFullSequences: true },
+			{ ignoreTrims: true, forceFullSequences: false },
+		);
+	}
+
+	// Prefer word-level timing; phrase-level is used only if no word-level pass produced output.
+	const attemptModes: Array<"word" | "phrase"> = ["word", "phrase"];
+	for (const timestampMode of attemptModes) {
+		for (const { ignoreTrims, forceFullSequences } of passes) {
+			const raw = await transcribeOne(ignoreTrims, forceFullSequences, timestampMode);
+			const segments = ignoreTrims ? dropSegmentsOverlappingTrimRegions(raw, trims) : raw;
+			if (segments.length > 0) {
+				return { segments, granularity: timestampMode };
+			}
+		}
+	}
+
+	return { segments: [], granularity: "phrase" };
+}
diff --git a/src/lib/vite-stubs/empty-node-module.ts b/src/lib/vite-stubs/empty-node-module.ts
new file mode 100644
index 000000000..16ee52688
--- /dev/null
+++ b/src/lib/vite-stubs/empty-node-module.ts
@@ -0,0 +1,7 @@
+/**
+ * Prototype-less object with no keys of its own, exported as the module default. Vite aliases
+ * the Node builtins that `@xenova/transformers` imports to this file; its `env.js` treats an
+ * empty object as “no filesystem” and therefore stays on browser / remote paths.
+ */
+const empty: Record<string, never> = Object.create(null);
+export default empty;
diff --git a/src/lib/vite-stubs/onnxruntime-node-stub.ts b/src/lib/vite-stubs/onnxruntime-node-stub.ts
new file mode 100644
index 000000000..a70b3dd60
--- /dev/null
+++ b/src/lib/vite-stubs/onnxruntime-node-stub.ts
@@ -0,0 +1,10 @@
+/**
+ * Transformers always imports `onnxruntime-node`, then picks web vs node from `process.release.name`.
+ * In Electron's renderer that name is often `"node"` while we still must use the WASM build — the real
+ * `onnxruntime-node` package is aliased away (it pulls `fs`). Re-export `onnxruntime-web` here so the
+ * "node" branch still receives a working ORT with `registerBackend` etc.
+ */
+import * as ortWeb from "onnxruntime-web";
+// Use the namespace's `default` export when it has one; otherwise export the namespace object itself.
+const ort = (ortWeb as { default?: typeof ortWeb }).default ?? ortWeb;
+export default ort;
diff --git a/vite.config.ts b/vite.config.ts
index 0779e1358..213e44711 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -28,8 +28,22 @@ export default defineConfig({
 	resolve: {
 		alias: {
 			"@": path.resolve(__dirname, "src"),
+			// @xenova/transformers: env.js statically imports fs/path/url; onnx.js imports
+			// onnxruntime-node (must not be bundled in the renderer — it requires fs).
+			fs: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+			path: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+			url: path.resolve(__dirname, "src/lib/vite-stubs/empty-node-module.ts"),
+			"onnxruntime-node": path.resolve(__dirname, "src/lib/vite-stubs/onnxruntime-node-stub.ts"), // re-exports web ORT
 		},
 	},
+	optimizeDeps: {
+		exclude: ["@xenova/transformers"],
+	},
+	// The captioning worker dynamically imports @xenova/transformers, which makes the
+	// worker bundle code-split — unsupported by the default "iife" worker format.
+	worker: {
+		format: "es",
+	},
 	build: {
 		target: "esnext",
 		minify: "terser",