diff --git a/CHANGELOG.md b/CHANGELOG.md index c45a1410e..f04c29195 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Release Notes +## [unreleased] + +Integrations: + +* Capture Gemini `cache_read`, `thoughts` and `tool_use_prompt` tokens in `instrument_google_genai`; compute `operation.cost` via `genai-prices` when available. + ## [v4.33.0] (2026-05-13) CLI: diff --git a/docs/integrations/llms/google-genai.md b/docs/integrations/llms/google-genai.md index 49720f614..277c78424 100644 --- a/docs/integrations/llms/google-genai.md +++ b/docs/integrations/llms/google-genai.md @@ -45,3 +45,21 @@ This creates a span which shows the conversation in the Logfire UI: to `true`, the spans will simply contain `` where the prompts and completions would be. [`logfire.instrument_google_genai()`][logfire.Logfire.instrument_google_genai] uses the `GoogleGenAiSdkInstrumentor().instrument()` method of the [`opentelemetry-instrumentation-google-genai`](https://pypi.org/project/opentelemetry-instrumentation-google-genai/) package. 
+ +## Token usage details + +When a span captures a Gemini call via `logfire.instrument_google_genai()`, the +following attributes may appear depending on the response: + +- `gen_ai.usage.input_tokens` — total prompt tokens (already includes cached tokens; see the note below) +- `gen_ai.usage.output_tokens` — completion tokens +- `gen_ai.usage.cache_read.input_tokens` — prompt tokens served from the [context cache](https://ai.google.dev/gemini-api/docs/caching) (cache hits) +- `gen_ai.usage.details.thoughts_tokens` — [reasoning tokens](https://ai.google.dev/gemini-api/docs/thinking) (Gemini 2.5 / 3.x) +- `gen_ai.usage.details.tool_use_prompt_tokens` — tokens in tool-use prompts, reported by the API as `tool_use_prompt_token_count` (see [function calling](https://ai.google.dev/gemini-api/docs/function-calling)) +- `operation.cost` — calculated price in USD using the [official Gemini pricing tables](https://ai.google.dev/gemini-api/docs/pricing) via [`genai-prices`](https://pypi.org/project/genai-prices/); omitted when `genai-prices` is not installed or does not recognize the model + +Note that, unlike Anthropic, the Gemini API's `prompt_token_count` already includes +the cached tokens; Logfire does not sum them again. 
This is documented in the +[`GenerateContentResponseUsageMetadata.prompt_token_count`](https://googleapis.github.io/python-genai/genai.html#genai.types.GenerateContentResponseUsageMetadata.prompt_token_count) +field description: *"When `cached_content` is set, this also includes the number +of tokens in the cached content."* diff --git a/logfire/_internal/integrations/google_genai.py b/logfire/_internal/integrations/google_genai.py index 33998b520..88b42e7f8 100644 --- a/logfire/_internal/integrations/google_genai.py +++ b/logfire/_internal/integrations/google_genai.py @@ -59,6 +59,68 @@ def wrapped_to_dict(obj: object) -> object: pass +try: + from opentelemetry.instrumentation.google_genai import generate_content as _gc_module + + _Helper = _gc_module._GenerateContentInstrumentationHelper # pyright: ignore[reportPrivateUsage] + _original_maybe_update = _Helper._maybe_update_token_counts # pyright: ignore[reportPrivateUsage] + _original_create_final = _Helper.create_final_attributes + + def _wrapped_maybe_update_token_counts(self: Any, response: Any) -> None: + _original_maybe_update(self, response) + try: + metadata = getattr(response, 'usage_metadata', None) + if metadata is None: + return + # "keep last non-zero" — streaming sends partial chunks; cached/thoughts/tool_use + # counts typically only appear in the final chunk. 
+ if cached := getattr(metadata, 'cached_content_token_count', None): + self._lf_cache_read = cached + if thoughts := getattr(metadata, 'thoughts_token_count', None): + self._lf_thoughts = thoughts + if tool_use := getattr(metadata, 'tool_use_prompt_token_count', None): + self._lf_tool_use_prompt = tool_use + self._lf_response = response + except Exception: # pragma: no cover + pass + + def _wrapped_create_final_attributes(self: Any) -> dict[str, Any]: + attrs = _original_create_final(self) + try: + if cached := getattr(self, '_lf_cache_read', None): + attrs['gen_ai.usage.cache_read.input_tokens'] = cached + if thoughts := getattr(self, '_lf_thoughts', None): + attrs['gen_ai.usage.details.thoughts_tokens'] = thoughts + if tool_use := getattr(self, '_lf_tool_use_prompt', None): + attrs['gen_ai.usage.details.tool_use_prompt_tokens'] = tool_use + response = getattr(self, '_lf_response', None) + if response is not None: + try: + from genai_prices import calc_price, extract_usage + + # genai_prices expects the camelCase JSON keys ('usageMetadata', 'modelVersion'); + # google-genai pydantic models use snake_case fields with camelCase aliases. 
+ usage_data = extract_usage(response.model_dump(by_alias=True), provider_id='google') + if usage_data.model is not None: # pragma: no branch + attrs['operation.cost'] = float( + calc_price( + usage_data.usage, + model_ref=usage_data.model.id, + provider_id='google', + ).total_price + ) + except Exception: + pass + except Exception: # pragma: no cover + pass + return attrs + + _Helper._maybe_update_token_counts = _wrapped_maybe_update_token_counts # pyright: ignore[reportPrivateUsage] + _Helper.create_final_attributes = _wrapped_create_final_attributes +except Exception: # pragma: no cover + pass + + Part: TypeAlias = 'dict[str, Any] | str' diff --git a/tests/otel_integrations/test_google_genai.py b/tests/otel_integrations/test_google_genai.py index 7b0c1bc08..ae9a8a3f9 100644 --- a/tests/otel_integrations/test_google_genai.py +++ b/tests/otel_integrations/test_google_genai.py @@ -1,5 +1,6 @@ import os import warnings +from typing import Any from unittest import mock from unittest.mock import patch @@ -108,6 +109,7 @@ def get_current_weather(location: str) -> str: 'gen_ai.usage.input_tokens': 58, 'gen_ai.usage.output_tokens': 9, 'gen_ai.response.finish_reasons': ('stop',), + 'operation.cost': 9.4e-06, 'logfire.metrics': IsPartialDict(), 'events': [ {'content': 'help', 'role': 'system'}, @@ -200,6 +202,7 @@ def get_current_weather(location: str) -> str: 'gen_ai.usage.input_tokens': 39, 'gen_ai.usage.output_tokens': 7, 'gen_ai.response.finish_reasons': ('stop',), + 'operation.cost': 6.7e-06, 'logfire.metrics': IsPartialDict(), 'events': [ {'content': '', 'role': 'user'}, @@ -258,6 +261,8 @@ class ResponseData(pydantic.BaseModel): 'gen_ai.usage.input_tokens': 2, 'gen_ai.usage.output_tokens': 13, 'gen_ai.response.finish_reasons': ('stop',), + 'gen_ai.usage.details.thoughts_tokens': 58, + 'operation.cost': 0.0001781, 'logfire.metrics': IsPartialDict(), 'events': [ {'content': 'Hi', 'role': 'user'}, @@ -278,6 +283,189 @@ class ResponseData(pydantic.BaseModel): ) +def 
_stub_generate_content(response: Any) -> Any: + def _generate(self: Any, **kwargs: Any) -> Any: + return response + + return _generate + + +def _build_fake_genai_response( + *, + model_version: str = 'gemini-2.5-flash', + prompt_token_count: int = 1000, + candidates_token_count: int = 200, + cached_content_token_count: int | None = None, + thoughts_token_count: int | None = None, + tool_use_prompt_token_count: int | None = None, +): + from google.genai.types import ( + Candidate, + Content, + FinishReason, + GenerateContentResponse, + GenerateContentResponseUsageMetadata, + Part, + ) + + return GenerateContentResponse( + model_version=model_version, + usage_metadata=GenerateContentResponseUsageMetadata( + prompt_token_count=prompt_token_count, + candidates_token_count=candidates_token_count, + cached_content_token_count=cached_content_token_count, + thoughts_token_count=thoughts_token_count, + tool_use_prompt_token_count=tool_use_prompt_token_count, + total_token_count=(prompt_token_count or 0) + (candidates_token_count or 0), + ), + candidates=[ + Candidate( + content=Content(parts=[Part.from_text(text='hi back')], role='model'), + finish_reason=FinishReason.STOP, + ) + ], + ) + + +@pytest.fixture +def reset_google_genai_instrumentation(): + """Force re-instrumentation so monkeypatched `Models.generate_content` is captured. + + The upstream `_MethodsSnapshot` captures `Models.generate_content` at instrument + time. The instrumentor is a process-wide singleton with an + `is_instrumented_by_opentelemetry` flag that gates re-instrumentation. We clear + the flag (the proper `uninstrument()` call asserts on a snapshot that the + upstream `__init__` resets to None on every `GoogleGenAiSdkInstrumentor()` call, + which makes it unreliable in a test suite) so the next `instrument()` call + re-creates the snapshot and picks up the mock. 
+ """ + from opentelemetry.instrumentation.google_genai import GoogleGenAiSdkInstrumentor + + instrumentor = GoogleGenAiSdkInstrumentor() + instrumentor._is_instrumented_by_opentelemetry = False # pyright: ignore[reportPrivateUsage] + yield + instrumentor._is_instrumented_by_opentelemetry = False # pyright: ignore[reportPrivateUsage] + + +def test_instrument_google_genai_cache_and_thinking_tokens( + exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None +) -> None: + from google.genai import Client + from google.genai.models import Models + + fake_response = _build_fake_genai_response( + prompt_token_count=1000, + candidates_token_count=200, + cached_content_token_count=750, + thoughts_token_count=80, + tool_use_prompt_token_count=30, + ) + monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(fake_response)) + + logfire.instrument_google_genai() + + client = Client(api_key='fake') + client.models.generate_content(model='gemini-2.5-flash', contents='hi') # type: ignore + + [span] = exporter.exported_spans_as_dict(parse_json_attributes=True) + attrs = span['attributes'] + assert attrs['gen_ai.usage.input_tokens'] == 1000 + assert attrs['gen_ai.usage.output_tokens'] == 200 + assert attrs['gen_ai.usage.cache_read.input_tokens'] == 750 + assert attrs['gen_ai.usage.details.thoughts_tokens'] == 80 + assert attrs['gen_ai.usage.details.tool_use_prompt_tokens'] == 30 + # operation.cost depends on the current Gemini 2.5 Flash pricing table in + # genai_prices; just confirm it was computed and is a sensible positive value. 
+ assert isinstance(attrs['operation.cost'], float) + assert attrs['operation.cost'] > 0 + + +def test_instrument_google_genai_no_cache_metadata( + exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None +) -> None: + from google.genai import Client + from google.genai.models import Models + + fake_response = _build_fake_genai_response( + prompt_token_count=58, + candidates_token_count=9, + ) + monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(fake_response)) + + logfire.instrument_google_genai() + + client = Client(api_key='fake') + client.models.generate_content(model='gemini-2.5-flash', contents='hi') # type: ignore + + [span] = exporter.exported_spans_as_dict(parse_json_attributes=True) + attrs = span['attributes'] + assert 'gen_ai.usage.cache_read.input_tokens' not in attrs + assert 'gen_ai.usage.details.thoughts_tokens' not in attrs + assert 'gen_ai.usage.details.tool_use_prompt_tokens' not in attrs + assert attrs['gen_ai.usage.input_tokens'] == 58 + assert attrs['gen_ai.usage.output_tokens'] == 9 + + +def test_instrument_google_genai_no_usage_metadata( + exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None +) -> None: + """Response missing `usage_metadata` entirely: no extra attrs and no cost computation.""" + from google.genai import Client + from google.genai.models import Models + from google.genai.types import Candidate, Content, FinishReason, GenerateContentResponse, Part + + fake_response = GenerateContentResponse( + model_version='gemini-2.5-flash', + usage_metadata=None, + candidates=[ + Candidate( + content=Content(parts=[Part.from_text(text='hi')], role='model'), + finish_reason=FinishReason.STOP, + ) + ], + ) + monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(fake_response)) + + logfire.instrument_google_genai() + + client = Client(api_key='fake') + client.models.generate_content(model='gemini-2.5-flash', contents='hi') 
# type: ignore + + [span] = exporter.exported_spans_as_dict(parse_json_attributes=True) + attrs = span['attributes'] + assert 'gen_ai.usage.cache_read.input_tokens' not in attrs + assert 'gen_ai.usage.details.thoughts_tokens' not in attrs + assert 'gen_ai.usage.details.tool_use_prompt_tokens' not in attrs + assert 'operation.cost' not in attrs + + +def test_instrument_google_genai_cost_silent_failure( + exporter: TestExporter, monkeypatch: pytest.MonkeyPatch, reset_google_genai_instrumentation: None +) -> None: + from google.genai import Client + from google.genai.models import Models + + fake_response = _build_fake_genai_response( + model_version='gemini-unknown-999', + prompt_token_count=1000, + candidates_token_count=200, + cached_content_token_count=750, + thoughts_token_count=80, + ) + monkeypatch.setattr(Models, 'generate_content', _stub_generate_content(fake_response)) + + logfire.instrument_google_genai() + + client = Client(api_key='fake') + client.models.generate_content(model='gemini-unknown-999', contents='hi') # type: ignore + + [span] = exporter.exported_spans_as_dict(parse_json_attributes=True) + attrs = span['attributes'] + assert 'operation.cost' not in attrs + assert attrs['gen_ai.usage.cache_read.input_tokens'] == 750 + assert attrs['gen_ai.usage.details.thoughts_tokens'] == 80 + + def test_span_event_logger_with_none_parts(exporter: TestExporter) -> None: """Test that SpanEventLogger handles parts=None gracefully.