Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/feat-groq-ai-transcription.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
'@tanstack/ai-groq': minor
---

Adds Groq as a transcription provider. Groq's API is mostly OpenAI SDK-compatible,
but its transcription endpoint additionally accepts a remote HTTP(S) audio URL via
a `url` form field, so the adapter is implemented as a custom integration that
calls the endpoint directly with `fetch` rather than going through the SDK.
292 changes: 292 additions & 0 deletions packages/ai-groq/src/adapters/transcription.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
import { BaseTranscriptionAdapter } from '@tanstack/ai/adapters'
import { base64ToArrayBuffer, generateId } from '@tanstack/ai-utils'
import { getGroqApiKeyFromEnv, withGroqDefaults } from '../utils/client'
import type {
TranscriptionOptions,
TranscriptionResult,
TranscriptionSegment,
} from '@tanstack/ai'
import type { GroqTranscriptionModel } from '../model-meta'
import type { GroqTranscriptionProviderOptions } from '../audio/transcription-provider-options'
import type { GroqClientConfig } from '../utils/client'

/**
* Configuration for the Groq Transcription adapter.
*
* Declares no members of its own: every field is inherited from
* `GroqClientConfig`. It exists as a separate name so the transcription
* adapter has its own config type to export.
*/
export interface GroqTranscriptionConfig extends GroqClientConfig {}

// Shape of Groq's verbose_json transcription response.
//
// The adapter casts the parsed JSON body to this type without runtime
// validation, so the declaration describes the expected payload rather than
// a guaranteed one. Fields without a comment below are declared for
// completeness but are not read by the adapter.
interface GroqVerboseTranscriptionResponse {
task?: string
// Copied to the result's `language` when present.
language?: string
// Copied to the result's `duration` when present.
duration?: number
// Full transcript; always copied to the result's `text`.
text: string
// Mapped one-to-one onto the result's `segments` (id, start, end, text, confidence).
segments?: Array<{
id: number
seek?: number
start: number
end: number
text: string
tokens?: Array<number>
temperature?: number
// Converted to the segment's `confidence` via `Math.exp`.
avg_logprob: number
compression_ratio?: number
no_speech_prob?: number
}>
// Mapped one-to-one onto the result's `words`.
words?: Array<{ word: string; start: number; end: number }>
// `x_groq.id` is used as the result id; a generated id is used when it is absent.
x_groq?: { id?: string }
}

// Shape of Groq's json transcription response.
//
// Used for the non-verbose, non-text branch of the adapter. The parsed body is
// cast to this type without runtime validation.
interface GroqJsonTranscriptionResponse {
// Transcript; copied to the result's `text`.
text: string
// `x_groq.id` is used as the result id; a generated id is used when it is absent.
x_groq?: { id?: string }
}

/**
 * Groq Transcription (Speech-to-Text) Adapter
 *
 * Tree-shakeable adapter for Groq audio transcription. Supports
 * whisper-large-v3 and whisper-large-v3-turbo.
 *
 * Features:
 * - Audio file uploads (File, Blob, ArrayBuffer, base64/data URL)
 * - Remote audio URLs passed directly via Groq's `url` field — no upload needed
 * - Verbose JSON response with segment and word timestamps
 * - Language detection or specification (ISO-639-1)
 * - Confidence scores derived from segment avg_logprob
 *
 * Requests are sent with `fetch` as multipart form data to
 * `<baseURL>/audio/transcriptions`, authenticated with a bearer token.
 */
export class GroqTranscriptionAdapter<
  TModel extends GroqTranscriptionModel,
> extends BaseTranscriptionAdapter<TModel, GroqTranscriptionProviderOptions> {
  readonly name = 'groq' as const

  private readonly apiKey: string
  private readonly baseURL: string
  // Kept as `unknown` because the declared type lives in `GroqClientConfig`;
  // the value is narrowed at runtime in `buildRequestHeaders`.
  private readonly defaultHeaders: unknown

  /**
   * @param config - Client configuration; `apiKey`, `baseURL` and
   *   `defaultHeaders` are the fields this adapter uses.
   * @param model - Transcription model identifier sent with every request.
   */
  constructor(config: GroqTranscriptionConfig, model: TModel) {
    super(model, {})
    const resolved = withGroqDefaults(config)
    this.apiKey = resolved.apiKey
    // Strip trailing slashes so the endpoint path never becomes `//audio/...`.
    this.baseURL = (
      resolved.baseURL ?? 'https://api.groq.com/openai/v1'
    ).replace(/\/+$/, '')
    // NOTE(review): assumes `GroqClientConfig` exposes `defaultHeaders` (callers
    // already pass it); confirm against `../utils/client`.
    this.defaultHeaders = config.defaultHeaders
  }

  /**
   * Transcribes audio with Groq.
   *
   * @param options - Shared transcription options plus Groq `modelOptions`.
   *   `options.audio` may be an http(s) URL (forwarded as `url`), or a File,
   *   Blob, ArrayBuffer, base64 string or data URL (uploaded as `file`).
   * @returns The transcript. In the default `verbose_json` mode the result
   *   also carries language, duration, segments and words when Groq returns
   *   them; in `text`/`json` mode `language` echoes the requested language.
   * @throws Error when the audio input type is unsupported, when `File` is
   *   unavailable, or when Groq responds with a non-2xx status (the message
   *   is Groq's `error.message` when present). Failures raised from the
   *   request onwards are logged through `options.logger` and rethrown.
   */
  async transcribe(
    options: TranscriptionOptions<GroqTranscriptionProviderOptions>,
  ): Promise<TranscriptionResult> {
    const { model, audio, language, prompt, responseFormat, modelOptions } =
      options

    // Default to verbose_json so callers get language, duration, and timestamps
    // without having to opt in explicitly. Both Groq whisper models support it.
    const effectiveFormat = responseFormat || 'verbose_json'
    const useVerbose = effectiveFormat === 'verbose_json'

    const form = new FormData()
    form.append('model', model)
    form.append('response_format', effectiveFormat)
    if (language !== undefined) form.append('language', language)
    if (prompt !== undefined) form.append('prompt', prompt)
    if (modelOptions?.temperature !== undefined) {
      form.append('temperature', String(modelOptions.temperature))
    }
    if (modelOptions?.timestamp_granularities !== undefined) {
      for (const granularity of modelOptions.timestamp_granularities) {
        form.append('timestamp_granularities[]', granularity)
      }
    }

    // HTTP/HTTPS URLs are forwarded directly via Groq's `url` field, which
    // avoids a round-trip upload. URI schemes are case-insensitive, so the
    // match ignores case; otherwise `HTTPS://…` would be base64-decoded.
    // All other inputs (File, Blob, ArrayBuffer, base64, data URL) are
    // converted to a File and sent as `file`.
    if (typeof audio === 'string' && /^https?:\/\//i.test(audio)) {
      form.append('url', audio)
    } else {
      form.append('file', this.prepareAudioFile(audio))
    }

    try {
      options.logger.request(
        `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose}`,
        { provider: this.name, model },
      )

      // No explicit Content-Type: fetch derives the multipart boundary from
      // the FormData body.
      const response = await fetch(`${this.baseURL}/audio/transcriptions`, {
        method: 'POST',
        headers: this.buildRequestHeaders(),
        body: form,
      })

      if (!response.ok) {
        // The error body may not be JSON (e.g. a proxy error page); fall back
        // to a status-only message in that case.
        const body = await response
          .json()
          .catch(() => null as Record<string, unknown> | null)
        const message =
          (body?.error as { message?: string } | undefined)?.message ??
          `Groq API error ${response.status}`
        throw new Error(message)
      }

      if (useVerbose) {
        const data = (await response.json()) as GroqVerboseTranscriptionResponse
        const requestId = data.x_groq?.id ?? generateId(this.name)

        // `TranscriptionResult` declares optional fields without `| undefined`,
        // so under exactOptionalPropertyTypes we must omit absent fields rather
        // than assigning `undefined`.
        const segments = data.segments?.map(
          (seg): TranscriptionSegment => ({
            id: seg.id,
            start: seg.start,
            end: seg.end,
            text: seg.text,
            // avg_logprob is a mean log-probability; exp() maps it to (0, 1].
            confidence: Math.exp(seg.avg_logprob),
          }),
        )
        const words = data.words?.map((w) => ({
          word: w.word,
          start: w.start,
          end: w.end,
        }))

        return {
          id: requestId,
          model,
          text: data.text,
          ...(data.language !== undefined && { language: data.language }),
          ...(data.duration !== undefined && { duration: data.duration }),
          ...(segments !== undefined && { segments }),
          ...(words !== undefined && { words }),
        }
      } else if (effectiveFormat === 'text') {
        // Plain-text responses carry no metadata, so the id is generated and
        // the language is whatever the caller requested.
        const text = await response.text()
        return {
          id: generateId(this.name),
          model,
          text,
          ...(language !== undefined && { language }),
        }
      } else {
        const data = (await response.json()) as GroqJsonTranscriptionResponse
        return {
          id: data.x_groq?.id ?? generateId(this.name),
          model,
          text: data.text,
          ...(language !== undefined && { language }),
        }
      }
    } catch (error: unknown) {
      options.logger.errors(`${this.name}.transcribe fatal`, {
        error,
        source: `${this.name}.transcribe`,
      })
      throw error
    }
  }

  /**
   * Builds the request headers: any configured default headers first, then
   * the bearer token, so the API key from the config always wins.
   */
  private buildRequestHeaders(): Headers {
    const headers = new Headers()
    const extra = this.defaultHeaders

    if (extra instanceof Headers) {
      extra.forEach((value, key) => headers.set(key, value))
    } else if (Array.isArray(extra)) {
      // `[name, value]` tuple form.
      for (const entry of extra) {
        if (
          Array.isArray(entry) &&
          typeof entry[0] === 'string' &&
          typeof entry[1] === 'string'
        ) {
          headers.set(entry[0], entry[1])
        }
      }
    } else if (typeof extra === 'object' && extra !== null) {
      // Plain record form; null/undefined values mean "do not send".
      for (const [key, value] of Object.entries(extra)) {
        if (typeof value === 'string') headers.set(key, value)
      }
    }

    headers.set('Authorization', `Bearer ${this.apiKey}`)
    return headers
  }

  /**
   * Converts any non-URL audio input into a `File` suitable for the `file`
   * form field. The file name's extension is derived from the MIME type when
   * one is known, so the name agrees with the content.
   *
   * @throws Error when the input is not a File, Blob, ArrayBuffer or string,
   *   or when the global `File` constructor is unavailable.
   */
  private prepareAudioFile(audio: string | File | Blob | ArrayBuffer): File {
    if (typeof File !== 'undefined' && audio instanceof File) {
      return audio
    }
    if (typeof Blob !== 'undefined' && audio instanceof Blob) {
      this.ensureFileSupport()
      const blobType = audio.type || 'audio/mpeg'
      return new File([audio], `audio.${this.extensionForMimeType(blobType)}`, {
        type: blobType,
      })
    }
    if (typeof ArrayBuffer !== 'undefined' && audio instanceof ArrayBuffer) {
      this.ensureFileSupport()
      // Raw bytes carry no type information; assume MP3 as before.
      return new File([audio], 'audio.mp3', { type: 'audio/mpeg' })
    }
    if (typeof audio === 'string') {
      this.ensureFileSupport()

      if (audio.startsWith('data:')) {
        // data:<mime>[;params];base64,<payload>
        const commaIndex = audio.indexOf(',')
        const header = commaIndex === -1 ? audio : audio.slice(0, commaIndex)
        const base64Data = commaIndex === -1 ? '' : audio.slice(commaIndex + 1)
        const mimeType = /^data:([^;,]+)/.exec(header)?.[1] ?? 'audio/mpeg'
        const bytes = base64ToArrayBuffer(base64Data)
        return new File(
          [bytes],
          `audio.${this.extensionForMimeType(mimeType)}`,
          { type: mimeType },
        )
      }

      // Any other string is treated as bare base64 with no type information.
      const bytes = base64ToArrayBuffer(audio)
      return new File([bytes], 'audio.mp3', { type: 'audio/mpeg' })
    }

    throw new Error('Invalid audio input type')
  }

  /**
   * Maps a MIME type (parameters such as `;codecs=opus` are ignored) to a
   * file extension. Unknown types fall back to their subtype without a
   * leading `x-`, or to `mp3` when there is no subtype.
   */
  private extensionForMimeType(mimeType: string): string {
    const essence = (mimeType.split(';')[0] ?? '').trim().toLowerCase()
    switch (essence) {
      case 'audio/mpeg':
      case 'audio/mp3':
        return 'mp3'
      case 'audio/wav':
      case 'audio/wave':
      case 'audio/x-wav':
      case 'audio/vnd.wave':
        return 'wav'
      case 'audio/mp4':
      case 'video/mp4':
        return 'mp4'
      case 'audio/m4a':
      case 'audio/x-m4a':
        return 'm4a'
      case 'audio/ogg':
      case 'application/ogg':
        return 'ogg'
      case 'audio/flac':
      case 'audio/x-flac':
        return 'flac'
      case 'audio/webm':
      case 'video/webm':
        return 'webm'
      default: {
        const subtype = (essence.split('/')[1] ?? '').replace(/^x-/, '')
        return subtype || 'mp3'
      }
    }
  }

  // Throws on Node < 20 where the global `File` constructor is unavailable.
  private ensureFileSupport(): void {
    if (typeof File === 'undefined') {
      throw new Error(
        '`File` is not available in this environment. ' +
          'Use Node.js 20 or newer, or pass a File object directly.',
      )
    }
  }
}

/**
 * Builds a Groq transcription adapter from an API key supplied by the caller.
 * The model's literal type is captured at the call site through `TModel`.
 *
 * @param model - Transcription model identifier (e.g. 'whisper-large-v3-turbo')
 * @param apiKey - Groq API key used to authenticate requests
 * @param config - Optional remaining client configuration (everything except `apiKey`)
 * @returns A `GroqTranscriptionAdapter` bound to `model`
 *
 * @example
 * ```typescript
 * const adapter = createGroqTranscription('whisper-large-v3-turbo', 'gsk_...');
 *
 * const result = await generateTranscription({
 *   adapter,
 *   audio: audioFile,
 *   language: 'en',
 * });
 * ```
 */
export function createGroqTranscription<TModel extends GroqTranscriptionModel>(
  model: TModel,
  apiKey: string,
  config?: Omit<GroqTranscriptionConfig, 'apiKey'>,
): GroqTranscriptionAdapter<TModel> {
  // `config` is spread after `apiKey`; spreading `undefined` adds nothing.
  const adapterConfig = { apiKey, ...config }
  return new GroqTranscriptionAdapter(adapterConfig, model)
}

/**
 * Builds a Groq transcription adapter whose API key comes from the
 * `GROQ_API_KEY` environment variable instead of an argument. The model's
 * literal type is captured at the call site through `TModel`.
 *
 * The lookup is delegated to `getGroqApiKeyFromEnv`, which searches:
 * - `process.env` (Node.js)
 * - `window.env` (browser with injected env)
 *
 * @param model - Transcription model identifier (e.g. 'whisper-large-v3-turbo')
 * @param config - Optional client configuration; `apiKey` is excluded because it is auto-detected
 * @returns A `GroqTranscriptionAdapter` bound to `model`
 * @throws Error if GROQ_API_KEY is not found in environment
 *
 * @example
 * ```typescript
 * const adapter = groqTranscription('whisper-large-v3-turbo');
 *
 * const result = await generateTranscription({
 *   adapter,
 *   audio: 'https://example.com/audio.mp3',
 * });
 *
 * console.log(result.text)
 * ```
 */
export function groqTranscription<TModel extends GroqTranscriptionModel>(
  model: TModel,
  config?: Omit<GroqTranscriptionConfig, 'apiKey'>,
): GroqTranscriptionAdapter<TModel> {
  // The key is resolved first, so a missing key fails before any adapter is built.
  return createGroqTranscription(model, getGroqApiKeyFromEnv(), config)
}
20 changes: 20 additions & 0 deletions packages/ai-groq/src/audio/transcription-provider-options.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
* Groq-specific options for audio transcription.
*
* These fields extend the shared `TranscriptionOptions` (supplied as
* `modelOptions`) and are forwarded to the Groq transcription endpoint as
* multipart form fields. A field left `undefined` is not sent at all.
*/
export interface GroqTranscriptionProviderOptions {
/**
* Sampling temperature between 0 and 1. Lower values produce more
* deterministic output. Groq recommends 0 (the default) for most use cases.
*
* Sent as the `temperature` form field, converted with `String(...)`.
*/
temperature?: number

/**
* Granularity levels to include when `response_format` is `verbose_json`.
* Pass `['word']`, `['segment']`, or both to control which timestamp arrays
* appear in the result.
*
* Each entry is sent as its own `timestamp_granularities[]` form field. The
* adapter forwards them whenever they are provided and does not check the
* response format itself.
*/
timestamp_granularities?: Array<'word' | 'segment'>
}
12 changes: 11 additions & 1 deletion packages/ai-groq/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ export {
type GroqTextProviderOptions,
} from './adapters/text'

// Transcription adapter
export {
GroqTranscriptionAdapter,
createGroqTranscription,
groqTranscription,
type GroqTranscriptionConfig,
} from './adapters/transcription'
export type { GroqTranscriptionProviderOptions } from './audio/transcription-provider-options'

// Types
export type {
GroqChatModelProviderOptionsByName,
Expand All @@ -22,8 +31,9 @@ export type {
ResolveProviderOptions,
ResolveInputModalities,
GroqChatModels,
GroqTranscriptionModel,
} from './model-meta'
export { GROQ_CHAT_MODELS } from './model-meta'
export { GROQ_CHAT_MODELS, GROQ_TRANSCRIPTION_MODELS } from './model-meta'
export type {
GroqTextMetadata,
GroqImageMetadata,
Expand Down
13 changes: 13 additions & 0 deletions packages/ai-groq/src/model-meta.ts
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,16 @@ export type ResolveInputModalities<TModel extends string> =
TModel extends keyof GroqModelInputModalitiesByName
? GroqModelInputModalitiesByName[TModel]
: readonly ['text']

/**
* All supported Groq transcription model identifiers.
*
* Declared `as const` so the entries keep their literal types; the
* `GroqTranscriptionModel` union is derived from this tuple.
*/
export const GROQ_TRANSCRIPTION_MODELS = [
'whisper-large-v3-turbo',
'whisper-large-v3',
] as const

/**
* Union type of all supported Groq transcription model names.
*
* Derived from `GROQ_TRANSCRIPTION_MODELS` by indexed access, so adding an
* entry to that tuple extends this union automatically.
*/
export type GroqTranscriptionModel = (typeof GROQ_TRANSCRIPTION_MODELS)[number]
2 changes: 1 addition & 1 deletion testing/e2e/src/lib/feature-support.ts
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ export const matrix: Record<Feature, Set<Provider>> = {
'audio-gen': new Set(['gemini', 'elevenlabs']),
'sound-effects': new Set(['elevenlabs']),
tts: new Set(['openai', 'grok', 'elevenlabs']),
transcription: new Set(['openai', 'grok', 'elevenlabs']),
transcription: new Set(['openai', 'grok', 'groq', 'elevenlabs']),
'video-gen': new Set(['openai']),
// Only Gemini currently surfaces a first-class stateful conversation API via
// the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental).
Expand Down
6 changes: 6 additions & 0 deletions testing/e2e/src/lib/media-providers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
createGrokSpeech,
createGrokTranscription,
} from '@tanstack/ai-grok'
import { createGroqTranscription } from '@tanstack/ai-groq'
import {
createElevenLabsAudio,
createElevenLabsSpeech,
Expand Down Expand Up @@ -109,6 +110,11 @@ export function createTranscriptionAdapter(
baseURL: openaiUrl(aimockPort),
defaultHeaders: headers,
}),
groq: () =>
createGroqTranscription('whisper-large-v3-turbo', DUMMY_KEY, {
baseURL: openaiUrl(aimockPort),
defaultHeaders: headers,
}),
elevenlabs: () =>
createElevenLabsTranscription('scribe_v1', DUMMY_KEY, {
baseUrl: llmockBase(aimockPort),
Expand Down