From 982dfe01620261e6b0faee897050e878abcbd7e3 Mon Sep 17 00:00:00 2001 From: hawi-tensei Date: Fri, 29 May 2026 01:34:56 +0200 Subject: [PATCH] feat(web): add objective Overall/Value/Capability score columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three sortable, transparently-computed score columns to the models table, plus a dynamic rank (#) column that renumbers with the current sort. Scores are derived entirely from existing objective catalog fields (cost, context window, output limit, capability flags, modality breadth, release date) — no benchmarks or hand-grading. Four normalized 0-100 components (capability, cost-efficiency, context, recency) are blended into three lenses with weights documented in one place in score.ts: - Overall: well-rounded "best overall" - Value: cost-efficiency weighted (cheap-yet-capable) - Capability: feature/modality breadth weighted The table defaults to Overall (descending). All three columns sort like any other column, so users can pick the lens that fits their use case. Scope is web-only; the canonical api.json data is unchanged. 
--- packages/web/src/index.css | 80 +++++++++++++++++---- packages/web/src/index.ts | 3 + packages/web/src/render.tsx | 39 ++++++++++- packages/web/src/score.ts | 134 ++++++++++++++++++++++++++++++++++++ packages/web/src/shared.ts | 19 +++++ 5 files changed, 259 insertions(+), 16 deletions(-) create mode 100644 packages/web/src/score.ts diff --git a/packages/web/src/index.css b/packages/web/src/index.css index 3dd25505e..5ed41f7c8 100644 --- a/packages/web/src/index.css +++ b/packages/web/src/index.css @@ -274,53 +274,103 @@ td { height: 48px; } +th.rank-col { + text-align: right; + font-weight: normal; + color: var(--color-text-tertiary); +} + +#models-table th:first-child, +#models-table td:first-child { + min-width: 2.75rem; +} + tbody { td { color: var(--color-text-tertiary); } - td:nth-child(1) { + td:nth-child(2) { font-weight: 500; } - td:nth-child(1), td:nth-child(2), + td:nth-child(3), + td:nth-child(4), td:nth-child(5), td:nth-child(6), td:nth-child(9), td:nth-child(10), - td:nth-child(11), - td:nth-child(12), td:nth-child(13), td:nth-child(14), td:nth-child(15), - td:nth-child(16) { + td:nth-child(16), + td:nth-child(17), + td:nth-child(18), + td:nth-child(19), + td:nth-child(20) { color: var(--color-text); } - td:nth-child(5), - td:nth-child(6), - td:nth-child(18) { + td:nth-child(9), + td:nth-child(10), + td:nth-child(22) { font-size: 0.8125rem; font-family: var(--font-mono); text-transform: uppercase; } - td:nth-child(3), - td:nth-child(4), - td:nth-child(9), - td:nth-child(10), - td:nth-child(11), - td:nth-child(12), + td:nth-child(7), + td:nth-child(8), td:nth-child(13), td:nth-child(14), td:nth-child(15), td:nth-child(16), - td:nth-child(17) { + td:nth-child(17), + td:nth-child(18), + td:nth-child(19), + td:nth-child(20), + td:nth-child(21) { font-size: 0.8125rem; font-family: var(--font-mono); } + td.rank { + font-family: var(--font-mono); + font-size: 0.8125rem; + color: var(--color-text-tertiary); + text-align: right; + font-variant-numeric: 
tabular-nums; + } + + .score { + position: relative; + display: flex; + align-items: center; + min-width: 3rem; + } + + .score::before { + content: ""; + position: absolute; + left: 0; + top: 50%; + transform: translateY(-50%); + height: 0.875rem; + width: calc(var(--score) * 1%); + background: color-mix(in srgb, var(--color-brand) 22%, transparent); + border-radius: 2px; + z-index: 0; + } + + .score-value { + position: relative; + z-index: 1; + font-family: var(--font-mono); + font-size: 0.8125rem; + font-variant-numeric: tabular-nums; + } + .provider-cell { display: flex; align-items: center; diff --git a/packages/web/src/index.ts b/packages/web/src/index.ts index 29beab273..e8daa7b6b 100644 --- a/packages/web/src/index.ts +++ b/packages/web/src/index.ts @@ -131,6 +131,9 @@ function prepareRow(row: TableRow): VirtualizedRow { const sortValues: VirtualizedRow["sortValues"] = [ row.providerName, row.modelName, + row.overallScore, + row.valueScore, + row.capabilityScore, row.family, row.providerId, row.modelId, diff --git a/packages/web/src/render.tsx b/packages/web/src/render.tsx index cd0106f08..265fb82f3 100644 --- a/packages/web/src/render.tsx +++ b/packages/web/src/render.tsx @@ -7,6 +7,7 @@ import { renderToString } from "hono/jsx/dom/server"; import { existsSync } from "fs"; import path from "path"; import { type TableRow, renderRow, getLargestRow } from "./shared.js"; +import { annotateScores } from "./score.js"; export const Providers = await generate( path.join(import.meta.dir, "..", "..", "..", "providers") @@ -64,7 +65,7 @@ for (const [providerId] of Object.entries(Providers)) { export const INITIAL_ROW_COUNT = 50; -export const TableRows: TableRow[] = Object.entries(Providers) +const RawRows = Object.entries(Providers) .sort(([, providerA], [, providerB]) => providerA.name.localeCompare(providerB.name) ) @@ -102,6 +103,11 @@ export const TableRows: TableRow[] = Object.entries(Providers) })) ); +// Attach objective scores, then default the table to a 
"best overall" ranking. +export const TableRows: TableRow[] = annotateScores(RawRows).sort( + (a, b) => b.overallScore - a.overallScore +); + const largestRow = getLargestRow(TableRows); export const Rendered = renderToString( @@ -142,12 +148,43 @@ export const Rendered = renderToString( + + + + diff --git a/packages/web/src/score.ts b/packages/web/src/score.ts new file mode 100644 index 000000000..40f8e9e8d --- /dev/null +++ b/packages/web/src/score.ts @@ -0,0 +1,134 @@ +import type { TableRow } from "./shared.js"; + +/** + * Objective model scoring. + * + * Every input is a factual field already in the catalog (cost, context window, + * output limit, capability flags, modality breadth, release date). Nothing is + * benchmarked or hand-graded. We turn those raw fields into four transparent, + * normalized 0-100 components, which the three composite scores are built from: + * + * - capability : what the model can do (capability flags + modality breadth) + * - cost : price efficiency (cheaper -> higher; free -> top) + * - context : context window + output limit (log-scaled) + * - recency : how recently it was released + * + * Each composite below is just a weighted blend of those four components. The + * weights are the only opinion in the file and are intentionally kept here, in + * one place, so they're easy to audit or change. + */ +const WEIGHTS = { + // Well-rounded "best overall". + overall: { capability: 0.4, cost: 0.3, context: 0.2, recency: 0.1 }, + // Cheap-yet-capable wins. + value: { capability: 0.35, cost: 0.5, context: 0.1, recency: 0.05 }, + // What the model can do dominates; price is a minor tiebreaker. + capability: { capability: 0.6, cost: 0.15, context: 0.2, recency: 0.05 }, +} as const; + +/** Rows before scores are attached. */ +type ScorableRow = Omit< + TableRow, + "overallScore" | "valueScore" | "capabilityScore" +>; + +const NEUTRAL = 50; + +/** + * Returns a function that maps a raw value to 0-100 via min-max over the + * dataset. 
Non-finite inputs (or a flat dataset) collapse to a neutral 50 so a + * missing field never silently wins or loses. + */ +function normalizer(values: number[]): (value: number) => number { + const finite = values.filter((value) => Number.isFinite(value)); + const min = finite.length ? Math.min(...finite) : 0; + const max = finite.length ? Math.max(...finite) : 0; + const span = max - min; + return (value: number) => { + if (!Number.isFinite(value) || span === 0) return NEUTRAL; + return ((value - min) / span) * 100; + }; +} + +/** Capability flags + how many input/output modalities are supported. */ +function capabilityRaw(row: ScorableRow): number { + const flags = + (row.toolCall ? 1 : 0) + + (row.reasoning ? 1 : 0) + + (row.structuredOutput ? 1 : 0) + + (row.temperature ? 1 : 0); + return flags + row.input.length + row.output.length; +} + +/** Context window + output limit, log-scaled (they span orders of magnitude). */ +function contextRaw(row: ScorableRow): number { + return ( + Math.log10((row.contextLimit || 0) + 1) + + 0.5 * Math.log10((row.outputLimit || 0) + 1) + ); +} + +/** + * Blended price per 1M tokens (input + output). Returns NaN when no pricing is + * published so the model lands on a neutral cost score rather than a free pass. + */ +function blendedCost(row: ScorableRow): number { + const parts = [row.inputCost, row.outputCost].filter( + (cost): cost is number => cost !== undefined, + ); + if (parts.length === 0) return NaN; + return parts.reduce((sum, cost) => sum + cost, 0); +} + +/** Release date as an epoch (ms); newer is higher. NaN when unparseable. */ +function recencyRaw(row: ScorableRow): number { + return Date.parse(row.releaseDate); +} + +function round(value: number): number { + return Math.round(value * 10) / 10; +} + +/** + * Computes the three composite scores for every row and returns new rows with + * `overallScore`, `valueScore` and `capabilityScore` attached. 
Normalization is + * over the whole set, so scores are relative to the rest of the catalog. + */ +export function annotateScores(rows: ScorableRow[]): TableRow[] { + const capNorm = normalizer(rows.map(capabilityRaw)); + const ctxNorm = normalizer(rows.map(contextRaw)); + const recNorm = normalizer(rows.map(recencyRaw)); + // Cost is log-scaled then inverted: lower price -> higher score. + const costNorm = normalizer( + rows.map((row) => { + const cost = blendedCost(row); + return Number.isNaN(cost) ? NaN : Math.log10(cost + 0.01); + }), + ); + + return rows.map((row) => { + const capability = capNorm(capabilityRaw(row)); + const context = ctxNorm(contextRaw(row)); + const recency = recNorm(recencyRaw(row)); + const cost = blendedCost(row); + // Invert cost: cheapest model in the set scores highest. Unknown -> neutral. + const costScore = Number.isNaN(cost) + ? NEUTRAL + : 100 - costNorm(Math.log10(cost + 0.01)); + + const blend = (w: (typeof WEIGHTS)[keyof typeof WEIGHTS]) => + round( + capability * w.capability + + costScore * w.cost + + context * w.context + + recency * w.recency, + ); + + return { + ...row, + overallScore: blend(WEIGHTS.overall), + valueScore: blend(WEIGHTS.value), + capabilityScore: blend(WEIGHTS.capability), + }; + }); +} diff --git a/packages/web/src/shared.ts b/packages/web/src/shared.ts index 153705937..188712be1 100644 --- a/packages/web/src/shared.ts +++ b/packages/web/src/shared.ts @@ -5,6 +5,9 @@ export interface TableRow { modelId: string; modelName: string; family?: string; + overallScore: number; + valueScore: number; + capabilityScore: number; toolCall: boolean; reasoning: boolean; input: string[]; @@ -78,6 +81,17 @@ export function weightsText(value: boolean) { return value ? "Open" : "Closed"; } +export function rankText(index: number) { + return index >= 0 ? String(index + 1) : ""; +} + +export function renderScore(value: number) { + const pct = Math.max(0, Math.min(100, value)); + return `
${value.toFixed( + 1 + )}
`; +} + export function renderModalityIcon(modality: string) { const label = modality === "pdf" @@ -101,10 +115,14 @@ export function renderCopyButton(modelId: string) { export function renderRow(row: TableRow, index: number) { return ` + + + +
# Provider Model +
+ + Overall +
+ score /100 +
+ +
+
+
+ + Value +
+ score /100 +
+ +
+
+
+ + Capability +
+ score /100 +
+ +
+
Family
${rankText(index)}
${row.providerLogoSvg}${escapeHtml( row.providerName )}
${escapeHtml(row.modelName)}${renderScore(row.overallScore)}${renderScore(row.valueScore)}${renderScore(row.capabilityScore)} ${escapeHtml(row.family ?? "-")} ${escapeHtml(row.providerId)}
${escapeHtml( @@ -136,6 +154,7 @@ export function renderRow(row: TableRow, index: number) { export function getLargestRow(rows: TableRow[]): TableRow { const worst: TableRow = { providerId: "", providerName: "", providerLogoSvg: "", modelId: "", modelName: "", + overallScore: 100, valueScore: 100, capabilityScore: 100, toolCall: true, reasoning: true, input: [], output: [], contextLimit: 0, outputLimit: 0,