TanStack · joksas · May 26, 2026 · May 26, 2026 · May 27, 2026 · May 27, 2026
diff --git a/.changeset/feat-groq-ai-transcription.md b/.changeset/feat-groq-ai-transcription.md
@@ -0,0 +1,7 @@
+---
+'@tanstack/ai-groq': minor
+---
+
+Adds Groq as a transcription provider. Groq's API is mostly OpenAI SDK-compatible,
+but its transcription endpoint additionally accepts HTTP URLs as input, so this
+is implemented as a custom integration rather than going through the SDK.
diff --git a/packages/ai-groq/src/adapters/transcription.ts b/packages/ai-groq/src/adapters/transcription.ts
@@ -0,0 +1,292 @@
+import { BaseTranscriptionAdapter } from '@tanstack/ai/adapters'
+import { base64ToArrayBuffer, generateId } from '@tanstack/ai-utils'
+import { getGroqApiKeyFromEnv, withGroqDefaults } from '../utils/client'
+import type {
+  TranscriptionOptions,
+  TranscriptionResult,
+  TranscriptionSegment,
+} from '@tanstack/ai'
+import type { GroqTranscriptionModel } from '../model-meta'
+import type { GroqTranscriptionProviderOptions } from '../audio/transcription-provider-options'
+import type { GroqClientConfig } from '../utils/client'
+
+/**
+ * Configuration for the Groq Transcription adapter.
+ *
+ * Currently identical to `GroqClientConfig`; it exists as a separate name so
+ * transcription-specific settings can be added without touching the shared
+ * client configuration.
+ */
+export interface GroqTranscriptionConfig extends GroqClientConfig {}
+
+// Shape of Groq's verbose_json transcription response.
+// The adapter casts the parsed JSON body to this type without runtime
+// validation. It reads `text`, `language`, `duration`, `words`, `x_groq.id`
+// and, per segment, `id`, `start`, `end`, `text` and `avg_logprob`; the
+// remaining fields are declared for completeness only.
+interface GroqVerboseTranscriptionResponse {
+  task?: string
+  language?: string
+  duration?: number
+  text: string
+  segments?: Array<{
+    id: number
+    seek?: number
+    start: number
+    end: number
+    text: string
+    tokens?: Array<number>
+    temperature?: number
+    // Exponentiated by the adapter to produce a segment confidence value.
+    avg_logprob: number
+    compression_ratio?: number
+    no_speech_prob?: number
+  }>
+  words?: Array<{ word: string; start: number; end: number }>
+  // Groq-specific metadata; `id` is used as the result id when present.
+  x_groq?: { id?: string }
+}
+
+// Shape of Groq's json transcription response.
+// Used for every response format that is neither verbose_json nor text.
+interface GroqJsonTranscriptionResponse {
+  text: string
+  // Groq-specific metadata; `id` is used as the result id when present.
+  x_groq?: { id?: string }
+}
+
+/**
+ * Groq Transcription (Speech-to-Text) Adapter
+ *
+ * Tree-shakeable adapter for Groq audio transcription. Supports
+ * whisper-large-v3 and whisper-large-v3-turbo.
+ *
+ * Requests are sent with `fetch` as multipart form data to
+ * `{baseURL}/audio/transcriptions`.
+ *
+ * Features:
+ * - Audio file uploads (File, Blob, ArrayBuffer, base64/data URL)
+ * - Remote audio URLs passed directly via Groq's `url` field — no upload needed
+ * - Verbose JSON response with segment and word timestamps
+ * - Language detection or specification (ISO-639-1)
+ * - Confidence scores derived from segment avg_logprob
+ */
+export class GroqTranscriptionAdapter<
+  TModel extends GroqTranscriptionModel,
+> extends BaseTranscriptionAdapter<TModel, GroqTranscriptionProviderOptions> {
+  readonly name = 'groq' as const
+
+  private readonly apiKey: string
+  private readonly baseURL: string
+  private readonly defaultHeaders: Record<string, string>
+
+  /**
+   * @param config - Client configuration (API key, optional base URL and
+   *   default headers)
+   * @param model - Transcription model this adapter instance is bound to
+   */
+  constructor(config: GroqTranscriptionConfig, model: TModel) {
+    super(model, {})
+    const resolved = withGroqDefaults(config)
+    this.apiKey = resolved.apiKey
+    // Strip trailing slashes so the endpoint path is never joined with `//`.
+    this.baseURL = (
+      resolved.baseURL ?? 'https://api.groq.com/openai/v1'
+    ).replace(/\/+$/, '')
+    // NOTE(review): `GroqClientConfig` is declared outside this file, so
+    // `defaultHeaders` is read defensively here — confirm its declared shape
+    // and tighten this access if possible.
+    this.defaultHeaders = GroqTranscriptionAdapter.collectHeaders(
+      (resolved as { defaultHeaders?: unknown }).defaultHeaders,
+    )
+  }
+
+  /**
+   * Transcribes audio through Groq's `/audio/transcriptions` endpoint.
+   *
+   * @param options - Shared transcription options plus Groq `modelOptions`
+   * @returns The transcript; verbose responses also carry language,
+   *   duration, segments and words when Groq returns them
+   * @throws Error when the audio input type is unsupported, when `File` is
+   *   unavailable, or when Groq answers with a non-2xx status
+   */
+  async transcribe(
+    options: TranscriptionOptions<GroqTranscriptionProviderOptions>,
+  ): Promise<TranscriptionResult> {
+    const { model, audio, language, prompt, responseFormat, modelOptions } =
+      options
+
+    // Default to verbose_json so callers get language, duration, and timestamps
+    // without having to opt in explicitly. Both Groq whisper models support it.
+    const effectiveFormat = responseFormat ?? 'verbose_json'
+    const useVerbose = effectiveFormat === 'verbose_json'
+
+    const form = new FormData()
+    form.append('model', model)
+    form.append('response_format', effectiveFormat)
+    if (language !== undefined) form.append('language', language)
+    if (prompt !== undefined) form.append('prompt', prompt)
+    if (modelOptions?.temperature !== undefined) {
+      form.append('temperature', String(modelOptions.temperature))
+    }
+    if (modelOptions?.timestamp_granularities !== undefined) {
+      for (const g of modelOptions.timestamp_granularities) {
+        form.append('timestamp_granularities[]', g)
+      }
+    }
+
+    // HTTP/HTTPS URLs are forwarded directly via Groq's `url` field, which
+    // avoids a round-trip upload. URI schemes are case-insensitive, so
+    // `HTTPS://…` must not fall through to the base64 branch. All other
+    // inputs (File, Blob, ArrayBuffer, base64, data URL) are converted to a
+    // File and sent as `file`.
+    if (typeof audio === 'string' && /^https?:\/\//i.test(audio)) {
+      form.append('url', audio)
+    } else {
+      form.append('file', this.prepareAudioFile(audio))
+    }
+
+    try {
+      options.logger.request(
+        `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose}`,
+        { provider: this.name, model },
+      )
+
+      const response = await fetch(`${this.baseURL}/audio/transcriptions`, {
+        method: 'POST',
+        // Configured default headers go first; the adapter's own
+        // Authorization header is applied last so it always wins.
+        headers: {
+          ...this.defaultHeaders,
+          Authorization: `Bearer ${this.apiKey}`,
+        },
+        body: form,
+      })
+
+      if (!response.ok) {
+        // Prefer the API's own error message; fall back to the status code
+        // when the body is not JSON or carries no message.
+        const body = await response
+          .json()
+          .catch(() => null as Record<string, unknown> | null)
+        const message =
+          (body?.error as { message?: string } | undefined)?.message ??
+          `Groq API error ${response.status}`
+        throw new Error(message)
+      }
+
+      if (useVerbose) {
+        const data = (await response.json()) as GroqVerboseTranscriptionResponse
+        const requestId = data.x_groq?.id ?? generateId(this.name)
+
+        // `TranscriptionResult` declares optional fields without `| undefined`,
+        // so under exactOptionalPropertyTypes we must omit absent fields rather
+        // than assigning `undefined`.
+        const segments = data.segments?.map(
+          (seg): TranscriptionSegment => ({
+            id: seg.id,
+            start: seg.start,
+            end: seg.end,
+            text: seg.text,
+            confidence: Math.exp(seg.avg_logprob),
+          }),
+        )
+        const words = data.words?.map((w) => ({
+          word: w.word,
+          start: w.start,
+          end: w.end,
+        }))
+
+        return {
+          id: requestId,
+          model,
+          text: data.text,
+          ...(data.language !== undefined && { language: data.language }),
+          ...(data.duration !== undefined && { duration: data.duration }),
+          ...(segments !== undefined && { segments }),
+          ...(words !== undefined && { words }),
+        }
+      } else if (effectiveFormat === 'text') {
+        // Plain-text responses carry no metadata, so the id is generated and
+        // the language is echoed from the request when one was given.
+        const text = await response.text()
+        return {
+          id: generateId(this.name),
+          model,
+          text,
+          ...(language !== undefined && { language }),
+        }
+      } else {
+        const data = (await response.json()) as GroqJsonTranscriptionResponse
+        return {
+          id: data.x_groq?.id ?? generateId(this.name),
+          model,
+          text: data.text,
+          ...(language !== undefined && { language }),
+        }
+      }
+    } catch (error: unknown) {
+      options.logger.errors(`${this.name}.transcribe fatal`, {
+        error,
+        source: `${this.name}.transcribe`,
+      })
+      throw error
+    }
+  }
+
+  /**
+   * Normalizes a header collection into a plain string map.
+   *
+   * Accepts a `Headers` instance or a plain object; entries whose value is
+   * not a string are skipped. `Content-Type` is dropped because `fetch` must
+   * generate the multipart boundary itself for `FormData` bodies.
+   */
+  private static collectHeaders(input: unknown): Record<string, string> {
+    const collected: Record<string, string> = {}
+    const add = (key: string, value: unknown): void => {
+      if (typeof value !== 'string') return
+      if (key.toLowerCase() === 'content-type') return
+      collected[key] = value
+    }
+
+    if (typeof Headers !== 'undefined' && input instanceof Headers) {
+      input.forEach((value, key) => add(key, value))
+    } else if (typeof input === 'object' && input !== null) {
+      for (const [key, value] of Object.entries(input)) add(key, value)
+    }
+    return collected
+  }
+
+  /**
+   * Converts any supported non-URL audio input into a `File` for upload.
+   *
+   * Strings are treated as a data URL when they start with `data:`, and as
+   * raw base64 otherwise.
+   *
+   * @throws Error when the input is none of the supported types, or when a
+   *   `File` has to be constructed and the global constructor is missing
+   */
+  private prepareAudioFile(audio: string | File | Blob | ArrayBuffer): File {
+    if (typeof File !== 'undefined' && audio instanceof File) {
+      return audio
+    }
+    if (typeof Blob !== 'undefined' && audio instanceof Blob) {
+      this.ensureFileSupport()
+      return new File([audio], 'audio.mp3', {
+        type: audio.type || 'audio/mpeg',
+      })
+    }
+    if (typeof ArrayBuffer !== 'undefined' && audio instanceof ArrayBuffer) {
+      this.ensureFileSupport()
+      return new File([audio], 'audio.mp3', { type: 'audio/mpeg' })
+    }
+    if (typeof audio === 'string') {
+      this.ensureFileSupport()
+
+      if (audio.startsWith('data:')) {
+        // `data:<mime>[;params],<payload>` — the MIME type drives both the
+        // File type and the file name extension.
+        const parts = audio.split(',')
+        const header = parts[0]
+        const base64Data = parts[1] || ''
+        const mimeMatch = header?.match(/data:([^;]+)/)
+        const mimeType = mimeMatch?.[1] || 'audio/mpeg'
+        const bytes = base64ToArrayBuffer(base64Data)
+        const extension = mimeType.split('/')[1] || 'mp3'
+        return new File([bytes], `audio.${extension}`, { type: mimeType })
+      }
+
+      const bytes = base64ToArrayBuffer(audio)
+      return new File([bytes], 'audio.mp3', { type: 'audio/mpeg' })
+    }
+
+    throw new Error('Invalid audio input type')
+  }
+
+  // Throws on Node < 20 where the global `File` constructor is unavailable.
+  private ensureFileSupport(): void {
+    if (typeof File === 'undefined') {
+      throw new Error(
+        '`File` is not available in this environment. ' +
+          'Use Node.js 20 or newer, or pass a File object directly.',
+      )
+    }
+  }
+}
+
+/**
+ * Builds a Groq transcription adapter from an explicitly supplied API key.
+ * The model type parameter is inferred at the call site.
+ *
+ * @param model - The model name (e.g., 'whisper-large-v3-turbo')
+ * @param apiKey - Your Groq API key
+ * @param config - Optional extra client configuration; its properties are
+ *   spread after `apiKey`
+ * @returns Configured Groq transcription adapter instance
+ *
+ * @example
+ * ```typescript
+ * const adapter = createGroqTranscription('whisper-large-v3-turbo', 'gsk_...');
+ *
+ * const result = await generateTranscription({
+ *   adapter,
+ *   audio: audioFile,
+ *   language: 'en',
+ * });
+ * ```
+ */
+export function createGroqTranscription<TModel extends GroqTranscriptionModel>(
+  model: TModel,
+  apiKey: string,
+  config?: Omit<GroqTranscriptionConfig, 'apiKey'>,
+): GroqTranscriptionAdapter<TModel> {
+  const adapterConfig: GroqTranscriptionConfig = { apiKey, ...config }
+  return new GroqTranscriptionAdapter(adapterConfig, model)
+}
+
+/**
+ * Builds a Groq transcription adapter whose API key is resolved by
+ * `getGroqApiKeyFromEnv()`. The model type parameter is inferred at the call
+ * site.
+ *
+ * Looks for `GROQ_API_KEY` in:
+ * - `process.env` (Node.js)
+ * - `window.env` (browser with injected env)
+ *
+ * @param model - The model name (e.g., 'whisper-large-v3-turbo')
+ * @param config - Optional configuration (excluding apiKey which is auto-detected)
+ * @returns Configured Groq transcription adapter instance
+ * @throws Error if GROQ_API_KEY is not found in environment
+ *
+ * @example
+ * ```typescript
+ * const adapter = groqTranscription('whisper-large-v3-turbo');
+ *
+ * const result = await generateTranscription({
+ *   adapter,
+ *   audio: 'https://example.com/audio.mp3',
+ * });
+ *
+ * console.log(result.text)
+ * ```
+ */
+export function groqTranscription<TModel extends GroqTranscriptionModel>(
+  model: TModel,
+  config?: Omit<GroqTranscriptionConfig, 'apiKey'>,
+): GroqTranscriptionAdapter<TModel> {
+  return createGroqTranscription(model, getGroqApiKeyFromEnv(), config)
+}
diff --git a/packages/ai-groq/src/audio/transcription-provider-options.ts b/packages/ai-groq/src/audio/transcription-provider-options.ts
@@ -0,0 +1,20 @@
+/**
+ * Groq-specific options for audio transcription.
+ *
+ * These fields extend the shared `TranscriptionOptions` and are sent to the
+ * Groq transcription endpoint as multipart form fields. Fields left
+ * `undefined` are omitted from the request.
+ */
+export interface GroqTranscriptionProviderOptions {
+  /**
+   * Sampling temperature between 0 and 1. Lower values produce more
+   * deterministic output. Groq recommends 0 (the default) for most use cases.
+   * Sent as its string representation in the `temperature` form field.
+   */
+  temperature?: number
+
+  /**
+   * Granularity levels to include when `response_format` is `verbose_json`.
+   * Pass `['word']`, `['segment']`, or both to control which timestamp arrays
+   * appear in the result. Each entry is sent as its own
+   * `timestamp_granularities[]` form field.
+   */
+  timestamp_granularities?: Array<'word' | 'segment'>
+}
diff --git a/packages/ai-groq/src/index.ts b/packages/ai-groq/src/index.ts
@@ -14,6 +14,15 @@ export {
   type GroqTextProviderOptions,
 } from './adapters/text'
 
+// Transcription adapter
+export {
+  GroqTranscriptionAdapter,
+  createGroqTranscription,
+  groqTranscription,
+  type GroqTranscriptionConfig,
+} from './adapters/transcription'
+export type { GroqTranscriptionProviderOptions } from './audio/transcription-provider-options'
+
 // Types
 export type {
   GroqChatModelProviderOptionsByName,
@@ -22,8 +31,9 @@ export type {
   ResolveProviderOptions,
   ResolveInputModalities,
   GroqChatModels,
+  GroqTranscriptionModel,
 } from './model-meta'
-export { GROQ_CHAT_MODELS } from './model-meta'
+export { GROQ_CHAT_MODELS, GROQ_TRANSCRIPTION_MODELS } from './model-meta'
 export type {
   GroqTextMetadata,
   GroqImageMetadata,

diff --git a/packages/ai-groq/src/model-meta.ts b/packages/ai-groq/src/model-meta.ts
@@ -402,3 +402,16 @@ export type ResolveInputModalities<TModel extends string> =
   TModel extends keyof GroqModelInputModalitiesByName
     ? GroqModelInputModalitiesByName[TModel]
     : readonly ['text']
+
+/**
+ * All supported Groq transcription model identifiers.
+ *
+ * Declared `as const` so the entries keep their string-literal types and the
+ * `GroqTranscriptionModel` union can be derived from this list.
+ */
+export const GROQ_TRANSCRIPTION_MODELS = [
+  'whisper-large-v3-turbo',
+  'whisper-large-v3',
+] as const
+
+/**
+ * Union type of all supported Groq transcription model names.
+ *
+ * Derived from `GROQ_TRANSCRIPTION_MODELS`, so adding a model to that list
+ * automatically widens this union.
+ */
+export type GroqTranscriptionModel = (typeof GROQ_TRANSCRIPTION_MODELS)[number]
diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts
@@ -165,7 +165,7 @@ export const matrix: Record<Feature, Set<Provider>> = {
   'audio-gen': new Set(['gemini', 'elevenlabs']),
   'sound-effects': new Set(['elevenlabs']),
   tts: new Set(['openai', 'grok', 'elevenlabs']),
-  transcription: new Set(['openai', 'grok', 'elevenlabs']),
+  transcription: new Set(['openai', 'grok', 'groq', 'elevenlabs']),
   'video-gen': new Set(['openai']),
   // Only Gemini currently surfaces a first-class stateful conversation API via
   // the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental).

diff --git a/testing/e2e/src/lib/media-providers.ts b/testing/e2e/src/lib/media-providers.ts
@@ -10,6 +10,7 @@ import {
   createGrokSpeech,
   createGrokTranscription,
 } from '@tanstack/ai-grok'
+import { createGroqTranscription } from '@tanstack/ai-groq'
 import {
   createElevenLabsAudio,
   createElevenLabsSpeech,
@@ -109,6 +110,11 @@ export function createTranscriptionAdapter(
         baseURL: openaiUrl(aimockPort),
         defaultHeaders: headers,
       }),
+    groq: () =>
+      createGroqTranscription('whisper-large-v3-turbo', DUMMY_KEY, {
+        baseURL: openaiUrl(aimockPort),
+        defaultHeaders: headers,
+      }),
     elevenlabs: () =>
       createElevenLabsTranscription('scribe_v1', DUMMY_KEY, {
         baseUrl: llmockBase(aimockPort),