pydantic · JonathanTsen · May 23, 2026 · May 23, 2026 · May 23, 2026 · cubic-dev-ai
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Release Notes
 
+## [unreleased]
+
+Integrations:
+
+* Capture Gemini `cache_read`, `thoughts` and `tool_use_prompt` tokens in `instrument_google_genai`; compute `operation.cost` via `genai-prices` when available.
+
 ## [v4.33.0] (2026-05-13)
 
 CLI:

diff --git a/docs/integrations/llms/google-genai.md b/docs/integrations/llms/google-genai.md
@@ -45,3 +45,21 @@ This creates a span which shows the conversation in the Logfire UI:
     to `true`, the spans will simply contain `<elided>` where the prompts and completions would be.
 
 [`logfire.instrument_google_genai()`][logfire.Logfire.instrument_google_genai] uses the `GoogleGenAiSdkInstrumentor().instrument()` method of the [`opentelemetry-instrumentation-google-genai`](https://pypi.org/project/opentelemetry-instrumentation-google-genai/) package.
+
+## Token usage details
+
+When a span captures a Gemini call via `logfire.instrument_google_genai()`, the
+following attributes may appear depending on the response:
+
+- `gen_ai.usage.input_tokens` — total prompt tokens (already includes cached, see below)
+- `gen_ai.usage.output_tokens` — completion tokens
+- `gen_ai.usage.cache_read.input_tokens` — tokens served from [context cache](https://ai.google.dev/gemini-api/docs/caching) (cache hit)
+- `gen_ai.usage.details.thoughts_tokens` — [reasoning tokens](https://ai.google.dev/gemini-api/docs/thinking) (Gemini 2.5 / 3.x)
+- `gen_ai.usage.details.tool_use_prompt_tokens` — tokens used for [tool definitions](https://ai.google.dev/gemini-api/docs/function-calling)
+- `operation.cost` — calculated price in USD using the [official Gemini pricing tables](https://ai.google.dev/gemini-api/docs/pricing) via [`genai-prices`](https://pypi.org/project/genai-prices/) (only present when the package is installed and model pricing is known)
+
+Note that, unlike Anthropic, the Gemini API's `prompt_token_count` already includes
+the cached tokens; Logfire does not sum them again. This is documented in the
+[`GenerateContentResponseUsageMetadata.prompt_token_count`](https://googleapis.github.io/python-genai/genai.html#genai.types.GenerateContentResponseUsageMetadata.prompt_token_count)
+field description: *"When `cached_content` is set, this also includes the number
+of tokens in the cached content."*
diff --git a/logfire/_internal/integrations/google_genai.py b/logfire/_internal/integrations/google_genai.py
@@ -59,6 +59,68 @@ def wrapped_to_dict(obj: object) -> object:
     pass
 
 
+# Best-effort patch of the upstream google-genai instrumentation helper: the two wrappers
+# below add cache-read / thoughts / tool-use token counts and `operation.cost` to the
+# attributes the helper produces. Any exception raised while setting the patch up is
+# swallowed by the final `except`, so a failure here never breaks importing this module.
+try:
+    from opentelemetry.instrumentation.google_genai import generate_content as _gc_module
+
+    _Helper = _gc_module._GenerateContentInstrumentationHelper  # pyright: ignore[reportPrivateUsage]
+    # Keep references to the upstream implementations so the wrappers can delegate to them.
+    _original_maybe_update = _Helper._maybe_update_token_counts  # pyright: ignore[reportPrivateUsage]
+    _original_create_final = _Helper.create_final_attributes
+
+    def _wrapped_maybe_update_token_counts(self: Any, response: Any) -> None:
+        """Run the upstream token-count update, then remember extra usage details on `self`.
+
+        Reads `response.usage_metadata` and, for each of `cached_content_token_count`,
+        `thoughts_token_count` and `tool_use_prompt_token_count` that is truthy, stores the
+        value on the helper (`_lf_cache_read`, `_lf_thoughts`, `_lf_tool_use_prompt`).
+        The response itself is kept as `_lf_response` whenever it carries usage metadata.
+        Errors raised by this extra bookkeeping are swallowed.
+        """
+        _original_maybe_update(self, response)
+        try:
+            metadata = getattr(response, 'usage_metadata', None)
+            if metadata is None:
+                # Nothing to record; previously stored values (if any) are left untouched.
+                return
+            # "keep last non-zero" — streaming sends partial chunks; cached/thoughts/tool_use
+            # counts typically only appear in the final chunk.
+            if cached := getattr(metadata, 'cached_content_token_count', None):
+                self._lf_cache_read = cached
+            if thoughts := getattr(metadata, 'thoughts_token_count', None):
+                self._lf_thoughts = thoughts
+            if tool_use := getattr(metadata, 'tool_use_prompt_token_count', None):
+                self._lf_tool_use_prompt = tool_use
+            # Kept for the cost calculation in `_wrapped_create_final_attributes`.
+            self._lf_response = response
+        except Exception:  # pragma: no cover
+            pass
+
+    def _wrapped_create_final_attributes(self: Any) -> dict[str, Any]:
+        """Return the upstream final attributes, extended with the extra usage details.
+
+        Adds `gen_ai.usage.cache_read.input_tokens`, `gen_ai.usage.details.thoughts_tokens`
+        and `gen_ai.usage.details.tool_use_prompt_tokens` for each value stored on the helper
+        that is truthy. If a response was stored, also tries to compute `operation.cost`
+        with `genai_prices`. Any failure leaves the attributes collected so far in place.
+        """
+        attrs = _original_create_final(self)
+        try:
+            if cached := getattr(self, '_lf_cache_read', None):
+                attrs['gen_ai.usage.cache_read.input_tokens'] = cached
+            if thoughts := getattr(self, '_lf_thoughts', None):
+                attrs['gen_ai.usage.details.thoughts_tokens'] = thoughts
+            if tool_use := getattr(self, '_lf_tool_use_prompt', None):
+                attrs['gen_ai.usage.details.tool_use_prompt_tokens'] = tool_use
+            response = getattr(self, '_lf_response', None)
+            if response is not None:
+                try:
+                    # Imported lazily inside the `try`, so a failing import is swallowed too.
+                    from genai_prices import calc_price, extract_usage
+
+                    # genai_prices expects the camelCase JSON keys ('usageMetadata', 'modelVersion');
+                    # google-genai pydantic models use snake_case fields with camelCase aliases.
+                    usage_data = extract_usage(response.model_dump(by_alias=True), provider_id='google')
+                    if usage_data.model is not None:  # pragma: no branch
+                        attrs['operation.cost'] = float(
+                            calc_price(
+                                usage_data.usage,
+                                model_ref=usage_data.model.id,
+                                provider_id='google',
+                            ).total_price
+                        )
+                except Exception:
+                    # Cost is optional: on any error the span simply has no `operation.cost`.
+                    pass
+        except Exception:  # pragma: no cover
+            pass
+        return attrs
+
+    # Install the wrappers on the upstream helper class.
+    _Helper._maybe_update_token_counts = _wrapped_maybe_update_token_counts  # pyright: ignore[reportPrivateUsage]
+    _Helper.create_final_attributes = _wrapped_create_final_attributes
+except Exception:  # pragma: no cover
+    pass
+
+
 Part: TypeAlias = 'dict[str, Any] | str'
 
 

diff --git a/tests/otel_integrations/test_google_genai.py b/tests/otel_integrations/test_google_genai.py
@@ -1,5 +1,6 @@
 import os
 import warnings
+from typing import Any
 from unittest import mock
 from unittest.mock import patch
 
@@ -108,6 +109,7 @@ def get_current_weather(location: str) -> str:
                     'gen_ai.usage.input_tokens': 58,
                     'gen_ai.usage.output_tokens': 9,
                     'gen_ai.response.finish_reasons': ('stop',),
+                    'operation.cost': 9.4e-06,
                     'logfire.metrics': IsPartialDict(),
                     'events': [
                         {'content': 'help', 'role': 'system'},
@@ -200,6 +202,7 @@ def get_current_weather(location: str) -> str:
                     'gen_ai.usage.input_tokens': 39,
                     'gen_ai.usage.output_tokens': 7,
                     'gen_ai.response.finish_reasons': ('stop',),
+                    'operation.cost': 6.7e-06,
                     'logfire.metrics': IsPartialDict(),
                     'events': [
                         {'content': '<elided>', 'role': 'user'},
@@ -258,6 +261,8 @@ class ResponseData(pydantic.BaseModel):
                     'gen_ai.usage.input_tokens': 2,
                     'gen_ai.usage.output_tokens': 13,
                     'gen_ai.response.finish_reasons': ('stop',),
+                    'gen_ai.usage.details.thoughts_tokens': 58,
+                    'operation.cost': 0.0001781,
                     'logfire.metrics': IsPartialDict(),
                     'events': [
                         {'content': 'Hi', 'role': 'user'},
@@ -278,6 +283,189 @@ class ResponseData(pydantic.BaseModel):
     )
 
 
+def _stub_generate_content(response: Any) -> Any:
+    """Build a replacement for `Models.generate_content` that always returns `response`."""
+
+    def _fake_generate_content(self: Any, **kwargs: Any) -> Any:
+        # The keyword arguments of the call are ignored; the canned response is returned as-is.
+        return response
+
+    return _fake_generate_content
+
+
+def _build_fake_genai_response(
+    *,
+    model_version: str = 'gemini-2.5-flash',
+    prompt_token_count: int = 1000,
+    candidates_token_count: int = 200,
+    cached_content_token_count: int | None = None,
+    thoughts_token_count: int | None = None,
+    tool_use_prompt_token_count: int | None = None,
+):
+    """Assemble a `GenerateContentResponse` with the given usage counts and one text candidate.
+
+    `total_token_count` is set to the sum of the prompt and candidates counts only.
+    """
+    from google.genai import types as genai_types
+
+    usage = genai_types.GenerateContentResponseUsageMetadata(
+        prompt_token_count=prompt_token_count,
+        candidates_token_count=candidates_token_count,
+        cached_content_token_count=cached_content_token_count,
+        thoughts_token_count=thoughts_token_count,
+        tool_use_prompt_token_count=tool_use_prompt_token_count,
+        total_token_count=(prompt_token_count or 0) + (candidates_token_count or 0),
+    )
+    # A single finished candidate whose content is the model's text reply.
+    reply = genai_types.Candidate(
+        content=genai_types.Content(parts=[genai_types.Part.from_text(text='hi back')], role='model'),
+        finish_reason=genai_types.FinishReason.STOP,
+    )
+    return genai_types.GenerateContentResponse(
+        model_version=model_version,
+        usage_metadata=usage,
+        candidates=[reply],
+    )
+
+
+@pytest.fixture
+def reset_google_genai_instrumentation():
+    """Force re-instrumentation so a monkeypatched `Models.generate_content` gets wrapped.
+
+    Upstream's `_MethodsSnapshot` records `Models.generate_content` at instrument time, and
+    the process-wide singleton instrumentor skips re-instrumentation while its
+    `is_instrumented_by_opentelemetry` flag is set. Calling `uninstrument()` is not
+    reliable in a test suite: it asserts on a snapshot that upstream's `__init__` resets
+    to None on every `GoogleGenAiSdkInstrumentor()` call. The flag is therefore cleared
+    directly, before and after the test, so the next `instrument()` call re-creates the
+    snapshot and picks up the mock.
+    """
+    from opentelemetry.instrumentation.google_genai import GoogleGenAiSdkInstrumentor
+
+    sdk_instrumentor = GoogleGenAiSdkInstrumentor()
+    sdk_instrumentor._is_instrumented_by_opentelemetry = False  # pyright: ignore[reportPrivateUsage]
+    yield
+    sdk_instrumentor._is_instrumented_by_opentelemetry = False  # pyright: ignore[reportPrivateUsage]
+
+
+def test_instrument_google_genai_cache_and_thinking_tokens(
+    exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None
+) -> None:
+    """Cached, thoughts and tool-use token counts plus a cost all land on the exported span."""
+    from google.genai import Client
+    from google.genai.models import Models
+
+    canned_response = _build_fake_genai_response(
+        prompt_token_count=1000,
+        candidates_token_count=200,
+        cached_content_token_count=750,
+        thoughts_token_count=80,
+        tool_use_prompt_token_count=30,
+    )
+    monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(canned_response))
+
+    logfire.instrument_google_genai()
+
+    genai_client = Client(api_key='fake')
+    genai_client.models.generate_content(model='gemini-2.5-flash', contents='hi')  # type: ignore
+
+    (only_span,) = exporter.exported_spans_as_dict(parse_json_attributes=True)
+    span_attrs = only_span['attributes']
+    expected_counts = {
+        'gen_ai.usage.input_tokens': 1000,
+        'gen_ai.usage.output_tokens': 200,
+        'gen_ai.usage.cache_read.input_tokens': 750,
+        'gen_ai.usage.details.thoughts_tokens': 80,
+        'gen_ai.usage.details.tool_use_prompt_tokens': 30,
+    }
+    for attr_name, expected_value in expected_counts.items():
+        assert span_attrs[attr_name] == expected_value
+    # The exact cost follows whatever Gemini 2.5 Flash pricing genai_prices currently ships,
+    # so only its type and sign are checked here.
+    cost = span_attrs['operation.cost']
+    assert isinstance(cost, float)
+    assert cost > 0
+
+
+def test_instrument_google_genai_no_cache_metadata(
+    exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None
+) -> None:
+    """With only prompt/candidates counts set, none of the extra usage attributes are emitted."""
+    from google.genai import Client
+    from google.genai.models import Models
+
+    canned_response = _build_fake_genai_response(prompt_token_count=58, candidates_token_count=9)
+    monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(canned_response))
+
+    logfire.instrument_google_genai()
+
+    genai_client = Client(api_key='fake')
+    genai_client.models.generate_content(model='gemini-2.5-flash', contents='hi')  # type: ignore
+
+    (only_span,) = exporter.exported_spans_as_dict(parse_json_attributes=True)
+    span_attrs = only_span['attributes']
+    for absent_attr in (
+        'gen_ai.usage.cache_read.input_tokens',
+        'gen_ai.usage.details.thoughts_tokens',
+        'gen_ai.usage.details.tool_use_prompt_tokens',
+    ):
+        assert absent_attr not in span_attrs
+    # The regular input/output counts are still reported.
+    assert span_attrs['gen_ai.usage.input_tokens'] == 58
+    assert span_attrs['gen_ai.usage.output_tokens'] == 9
+
+
+def test_instrument_google_genai_no_usage_metadata(
+    exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None
+) -> None:
+    """A response whose `usage_metadata` is None yields neither extra usage attrs nor a cost."""
+    from google.genai import Client
+    from google.genai import types as genai_types
+    from google.genai.models import Models
+
+    reply = genai_types.Candidate(
+        content=genai_types.Content(parts=[genai_types.Part.from_text(text='hi')], role='model'),
+        finish_reason=genai_types.FinishReason.STOP,
+    )
+    canned_response = genai_types.GenerateContentResponse(
+        model_version='gemini-2.5-flash',
+        usage_metadata=None,
+        candidates=[reply],
+    )
+    monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(canned_response))
+
+    logfire.instrument_google_genai()
+
+    genai_client = Client(api_key='fake')
+    genai_client.models.generate_content(model='gemini-2.5-flash', contents='hi')  # type: ignore
+
+    (only_span,) = exporter.exported_spans_as_dict(parse_json_attributes=True)
+    span_attrs = only_span['attributes']
+    for absent_attr in (
+        'gen_ai.usage.cache_read.input_tokens',
+        'gen_ai.usage.details.thoughts_tokens',
+        'gen_ai.usage.details.tool_use_prompt_tokens',
+        'operation.cost',
+    ):
+        assert absent_attr not in span_attrs
+
+
+def test_instrument_google_genai_cost_silent_failure(
+    exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None
+) -> None:
+    """For the model name 'gemini-unknown-999' no cost is emitted, but token details still are."""
+    from google.genai import Client
+    from google.genai.models import Models
+
+    canned_response = _build_fake_genai_response(
+        model_version='gemini-unknown-999',
+        prompt_token_count=1000,
+        candidates_token_count=200,
+        cached_content_token_count=750,
+        thoughts_token_count=80,
+    )
+    monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(canned_response))
+
+    logfire.instrument_google_genai()
+
+    genai_client = Client(api_key='fake')
+    genai_client.models.generate_content(model='gemini-unknown-999', contents='hi')  # type: ignore
+
+    (only_span,) = exporter.exported_spans_as_dict(parse_json_attributes=True)
+    span_attrs = only_span['attributes']
+    # The span carries no cost attribute at all.
+    assert 'operation.cost' not in span_attrs
+    # The token details do not depend on the cost calculation succeeding.
+    assert span_attrs['gen_ai.usage.cache_read.input_tokens'] == 750
+    assert span_attrs['gen_ai.usage.details.thoughts_tokens'] == 80
+
+
 def test_span_event_logger_with_none_parts(exporter: TestExporter) -> None:
     """Test that SpanEventLogger handles parts=None gracefully.