From ac50a1127573bee511abced1d0d0d0974e21dcfa Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:20:18 +0200 Subject: [PATCH 01/37] Add fields to GRPOConfig --- trl/trainer/grpo_config.py | 58 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py index 5736b6c0ddc..230c8f50c10 100644 --- a/trl/trainer/grpo_config.py +++ b/trl/trainer/grpo_config.py @@ -295,6 +295,28 @@ class GRPOConfig(_BaseConfig): position, improving results. Range: `[0.0-1.0]`. A value of `0.0` masks all but the highest entropy token; `1.0` keeps all tokens. The paper recommends a value of `0.2`. If used with `mask_truncated_completions=True`, only tokens from non-truncated completions are considered. + entropy_coef (`float`, *optional*, defaults to `0.0`): + Coefficient of the entropy regularization term in the loss. A positive value adds an entropy bonus that + encourages exploration by keeping the policy from collapsing to near-deterministic outputs. When + `use_adaptive_entropy=True`, this serves as the initial coefficient and is updated each optimizer step. + Has no effect when set to `0.0` (default). + use_adaptive_entropy (`bool`, *optional*, defaults to `False`): + Whether to use adaptive entropy control, introduced in + [Skywork-OR1](https://huggingface.co/papers/2505.22312). When enabled, the entropy coefficient + `entropy_coef` is updated each optimizer step: incremented by `entropy_coef_delta` when the current + entropy is below `entropy_target`, and decremented otherwise. The coefficient is only applied when + entropy is at or below `entropy_target`. + entropy_coef_min (`float`, *optional*, defaults to `0.0`): + Lower bound for the entropy coefficient when using adaptive entropy control. 
+ entropy_coef_max (`float`, *optional*, defaults to `1.0`): + Upper bound for the entropy coefficient when using adaptive entropy control. + entropy_coef_delta (`float`, *optional*, defaults to `0.005`): + Step size for adjusting the entropy coefficient at each optimizer step during adaptive entropy control. + entropy_target (`float`, *optional*, defaults to `0.2`): + Target mean per-token entropy (in nats) used by adaptive entropy control. The coefficient is only + applied when the current entropy falls at or below this value. Typical language models have per-token + entropies in the range 2–10 nats; the default of `0.2` nearly always triggers regularization, so users + should tune this to a value appropriate for their model and task. max_tool_calling_iterations (`int`, *optional*): Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation stops when the model generates a response turn with no tool calls or when the total response length reaches @@ -832,6 +854,42 @@ class GRPOConfig(_BaseConfig): "non-truncated completions are considered." }, ) + entropy_coef: float = field( + default=0.0, + metadata={ + "help": "Coefficient of the entropy regularization term in the loss. A positive value adds an entropy " + "bonus that encourages exploration. When `use_adaptive_entropy=True`, this serves as the initial " + "coefficient and is updated each optimizer step. Has no effect when set to `0.0` (default)." + }, + ) + use_adaptive_entropy: bool = field( + default=False, + metadata={ + "help": "Whether to use adaptive entropy control, introduced in Skywork-OR1 " + "(https://huggingface.co/papers/2505.22312). When enabled, `entropy_coef` is incremented by " + "`entropy_coef_delta` when entropy is below `entropy_target`, and decremented otherwise." 
+ }, + ) + entropy_coef_min: float = field( + default=0.0, + metadata={"help": "Lower bound for the entropy coefficient when using adaptive entropy control."}, + ) + entropy_coef_max: float = field( + default=1.0, + metadata={"help": "Upper bound for the entropy coefficient when using adaptive entropy control."}, + ) + entropy_coef_delta: float = field( + default=0.005, + metadata={"help": "Step size for adjusting the entropy coefficient during adaptive entropy control."}, + ) + entropy_target: float = field( + default=0.2, + metadata={ + "help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only " + "applied when current entropy is at or below this value. Typical language models have per-token " + "entropies of 2–10 nats; the default of 0.2 nearly always triggers regularization, so tune this." + }, + ) max_tool_calling_iterations: int | None = field( default=None, metadata={ From dcaaf676b7733ff7c190413fc47382e0977ff18d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:21:20 +0200 Subject: [PATCH 02/37] Add init fields to GRPOTrainer --- trl/trainer/grpo_trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index a7a673e8784..5d6b71ffc8c 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -667,6 +667,10 @@ def __init__( f"Unknown importance sampling level: {self.importance_sampling_level}. " "Possible values are 'token' and 'sequence'." 
) + self.entropy_coef = args.entropy_coef + self.use_adaptive_entropy = args.use_adaptive_entropy + if self.use_liger_kernel and self.entropy_coef != 0.0: + raise NotImplementedError("Entropy bonus is not supported with Liger kernel.") # Datasets self.shuffle_dataset = args.shuffle_dataset From 0f6306e748ec62f56da01e1ce24645183cfc6a2e Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:27:28 +0200 Subject: [PATCH 03/37] Update _compute_loss --- trl/trainer/grpo_trainer.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 5d6b71ffc8c..24c436beaf1 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2740,6 +2740,33 @@ def _compute_loss(self, model, inputs): else: raise ValueError(f"Unknown loss type: {self.loss_type}") + # Entropy bonus: add entropy regularization to encourage exploration + if self.entropy_coef != 0.0: + if self.loss_type in ["grpo", "sapo", "luspo"]: + entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer + elif self.loss_type == "bnpo": + entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer + elif self.loss_type == "dr_grpo": + entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer + elif self.loss_type in ["cispo", "dapo", "vespo"]: + entropy_loss = (entropies * mask).sum() / normalizer + + world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item() + if self.use_adaptive_entropy: + if world_entropy < self.args.entropy_target: + self.entropy_coef = min( + self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max + ) + else: + self.entropy_coef = max( + self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min + ) + apply_coef = self.entropy_coef if world_entropy <= 
self.args.entropy_target else 0.0 + else: + apply_coef = self.entropy_coef + + loss = loss - apply_coef * entropy_loss + # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too if self.aux_loss_enabled: normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 @@ -2755,6 +2782,11 @@ def masked_batch_mean(x): else: return (x * mask).sum() / completion_token_count + self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(loss.detach(), reduction="mean").item()) + if self.entropy_coef != 0.0: + self._metrics[mode]["entropy_loss"].append(world_entropy) + self._metrics[mode]["entropy_coef"].append(self.entropy_coef) + if self.beta != 0.0: mean_kl = masked_batch_mean(per_token_kl) self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item()) From 9b1cc655cd9a8802b4fce438b507ab4ea67b1ef6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:27:55 +0200 Subject: [PATCH 04/37] Add checkpoint persistence --- trl/trainer/grpo_trainer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 24c436beaf1..95e04e79fb2 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -17,6 +17,7 @@ import copy import importlib.resources as pkg_resources import inspect +import json import math import os import sys @@ -2921,3 +2922,16 @@ def _save_checkpoint(self, model, trial): model_name = self.args.hub_model_id.split("/")[-1] self.create_model_card(model_name=model_name) super()._save_checkpoint(model, trial) + if self.use_adaptive_entropy and self.is_world_process_zero(): + checkpoint_folder = f"checkpoint-{self.state.global_step}" + output_dir = os.path.join(self.args.output_dir, checkpoint_folder) + with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f: + json.dump({"entropy_coef": 
self.entropy_coef}, f) + + def _load_optimizer_and_scheduler(self, checkpoint): + super()._load_optimizer_and_scheduler(checkpoint) + if self.use_adaptive_entropy and checkpoint is not None: + path = os.path.join(checkpoint, "entropy_ctrl_state.json") + if os.path.exists(path): + with open(path) as f: + self.entropy_coef = json.load(f)["entropy_coef"] From e9447139352eb96fb0db579c2f3e843b92db14a3 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:28:58 +0200 Subject: [PATCH 05/37] Update GRPO docs --- docs/source/grpo_trainer.md | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index 0621d5ee689..940f5947026 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -185,7 +185,10 @@ While training and evaluating, we record the following reward metrics: - `reward`: The overall average reward after summing rewards across functions (weighted by `reward_weights`). - `reward_std`: The standard deviation of summed rewards across functions (weighted by `reward_weights`), computed over the full batch. - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect). +- `policy_loss`: The policy gradient loss value (before any entropy bonus). - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.) +- `entropy_loss`: The aggregated entropy used as the regularization term. Logged only if `entropy_coef` is nonzero. +- `entropy_coef`: The current entropy coefficient. Logged only if `entropy_coef` is nonzero. Changes each optimizer step when `use_adaptive_entropy=True`. 
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero. - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change. - `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\). @@ -641,6 +644,46 @@ and the reward will be computed as the sum of the rewards from each function, or Note that [`GRPOTrainer`] supports multiple reward functions of different types. See the parameters documentation for more details. +### Entropy regularization + +To encourage exploration and prevent the policy from collapsing to near-deterministic outputs, you can add an entropy bonus to the training objective. The entropy regularization augments the GRPO loss as follows: + +$$ +\mathcal{L}(\theta) = \mathcal{L}_{\text{GRPO}}(\theta) - \alpha \cdot \mathcal{H}(\pi_\theta), +$$ + +where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient. + +**Static entropy** — a fixed coefficient throughout training: + +```python +from trl import GRPOConfig, GRPOTrainer + +training_args = GRPOConfig(entropy_coef=0.05, ...) +``` + +**Adaptive entropy** — the coefficient is updated each optimizer step based on a target entropy, as introduced in [Skywork-OR1](https://huggingface.co/papers/2505.22312). 
When the current entropy falls at or below `entropy_target`, the coefficient is incremented by `entropy_coef_delta`; otherwise it is decremented. The coefficient is only applied (i.e. non-zero) while entropy is at or below the target: + +```python +training_args = GRPOConfig( + entropy_coef=0.01, # initial coefficient + use_adaptive_entropy=True, + entropy_target=5.0, # target mean per-token entropy (nats); tune for your model + entropy_coef_delta=0.005, # step size per optimizer step + entropy_coef_min=0.0, + entropy_coef_max=1.0, + ... +) +``` + + + +Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` is far below that range, so with the default the bonus is rarely applied (it is only active while entropy is at or below the target); set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). + + + +When `use_adaptive_entropy=True`, the current entropy coefficient `entropy_coef` is saved alongside each checkpoint and restored on resume, so training is fully resumable. + ### Rapid Experimentation for GRPO RapidFire AI is an open-source experimentation engine that sits on top of TRL and lets you launch multiple GRPO configurations at once, even on a single GPU. Instead of trying configurations sequentially, RapidFire lets you **see all their learning curves earlier, stop underperforming runs, and clone promising ones with new settings in flight** without restarting. For more information, see [RapidFire AI Integration](rapidfire_integration).
From f47d5a58c29ef7781d542533c885160f2b7a1515 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:29:30 +0200 Subject: [PATCH 06/37] Add tests --- tests/test_grpo_trainer.py | 68 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index d7a0b9960da..2401b8323c5 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -1474,6 +1474,74 @@ def test_train_with_cast_lm_head_to_fp32(self, model_name): new_param = trainer.model.get_parameter(n) assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + def test_train_with_static_entropy(self): + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=8, # reduce the completion length to reduce memory usage + report_to="none", + entropy_coef=0.1, + ) + trainer = GRPOTrainer( + model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=dataset, + ) + + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + assert trainer.state.log_history[-1]["policy_loss"] is not None + assert trainer.state.log_history[-1]["entropy_loss"] is not None + assert trainer.state.log_history[-1]["entropy_coef"] is not None + + # Check that the params have changed + for n, param in previous_trainable_params.items(): + new_param = 
trainer.model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + + def test_train_with_adaptive_entropy(self): + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=8, # reduce the completion length to reduce memory usage + report_to="none", + entropy_coef=0.01, + use_adaptive_entropy=True, + entropy_target=15.0, # above any realistic entropy → coef is always incremented + ) + trainer = GRPOTrainer( + model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=dataset, + ) + + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + assert trainer.state.log_history[-1]["policy_loss"] is not None + assert trainer.state.log_history[-1]["entropy_loss"] is not None + assert trainer.state.log_history[-1]["entropy_coef"] is not None + # Coefficient should have increased since entropy < target throughout training + assert trainer.entropy_coef > 0.01 + + # Check that the params have changed + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." 
+ def test_train_with_entropy_filter(self): dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") training_args = GRPOConfig( From 2484e70c30e2f7f5a443bed84ef5fd6b6a6c1ded Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:48:14 +0000 Subject: [PATCH 07/37] Address issues from review --- docs/source/grpo_trainer.md | 4 ++-- trl/trainer/grpo_trainer.py | 32 ++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index 940f5947026..c738b26a0c9 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -187,8 +187,8 @@ While training and evaluating, we record the following reward metrics: - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect). - `policy_loss`: The policy gradient loss value (before any entropy bonus). - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.) -- `entropy_loss`: The aggregated entropy used as the regularization term. Logged only if `entropy_coef` is nonzero. -- `entropy_coef`: The current entropy coefficient. Logged only if `entropy_coef` is nonzero. Changes each optimizer step when `use_adaptive_entropy=True`. +- `entropy_loss`: The aggregated entropy used as the regularization term. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. +- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`. - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. 
Logged only if `beta` is nonzero. - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change. - `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\). diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 95e04e79fb2..1b8c4ad2e48 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2741,8 +2741,14 @@ def _compute_loss(self, model, inputs): else: raise ValueError(f"Unknown loss type: {self.loss_type}") - # Entropy bonus: add entropy regularization to encourage exploration - if self.entropy_coef != 0.0: + # Capture the pure policy loss for logging before entropy/aux modify it + policy_loss = loss.detach() + + # Entropy bonus: add entropy regularization to encourage exploration. + # Gate: run whenever a non-zero static coef is set OR adaptive mode is enabled. Adaptive must always run even + # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy + # drops below entropy_target again. 
+ if self.entropy_coef != 0.0 or self.use_adaptive_entropy: if self.loss_type in ["grpo", "sapo", "luspo"]: entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer elif self.loss_type == "bnpo": @@ -2754,14 +2760,16 @@ def _compute_loss(self, model, inputs): world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item() if self.use_adaptive_entropy: - if world_entropy < self.args.entropy_target: - self.entropy_coef = min( - self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max - ) - else: - self.entropy_coef = max( - self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min - ) + # Update the coefficient once per optimizer step, not per micro-batch + if self.accelerator.sync_gradients: + if world_entropy < self.args.entropy_target: + self.entropy_coef = min( + self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max + ) + else: + self.entropy_coef = max( + self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min + ) apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0 else: apply_coef = self.entropy_coef @@ -2783,8 +2791,8 @@ def masked_batch_mean(x): else: return (x * mask).sum() / completion_token_count - self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(loss.detach(), reduction="mean").item()) - if self.entropy_coef != 0.0: + self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(policy_loss, reduction="mean").item()) + if self.entropy_coef != 0.0 or self.use_adaptive_entropy: self._metrics[mode]["entropy_loss"].append(world_entropy) self._metrics[mode]["entropy_coef"].append(self.entropy_coef) From 45077474beff6be14afae94b44946a66f4326cbc Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:04:57 +0000 Subject: [PATCH 08/37] Fix wrong entropy for adaptive control --- 
docs/source/grpo_trainer.md | 2 +- trl/trainer/grpo_trainer.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index c738b26a0c9..7f19eb03cc1 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -187,7 +187,7 @@ While training and evaluating, we record the following reward metrics: - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect). - `policy_loss`: The policy gradient loss value (before any entropy bonus). - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.) -- `entropy_loss`: The aggregated entropy used as the regularization term. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. +- `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. - `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`. - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero. - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change. 
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 1b8c4ad2e48..250a7650153 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2758,7 +2758,11 @@ def _compute_loss(self, model, inputs): elif self.loss_type in ["cispo", "dapo", "vespo"]: entropy_loss = (entropies * mask).sum() / normalizer - world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item() + # Mean per-token entropy in nats across ranks — computed independently of the loss + # normalizer so its scale matches entropy_target (loss-scaled entropy_loss would not). + world_entropy = self.accelerator.reduce( + ((entropies * mask).sum() / mask.sum().clamp(min=1.0)).detach(), reduction="mean" + ).item() if self.use_adaptive_entropy: # Update the coefficient once per optimizer step, not per micro-batch if self.accelerator.sync_gradients: From 9b70a4a7986234deabd4fd5480d10340c68bd8f3 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:05:23 +0000 Subject: [PATCH 09/37] Fix Liger skips adaptive entropy guard --- trl/trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 250a7650153..89d2b1e1044 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -670,7 +670,7 @@ def __init__( ) self.entropy_coef = args.entropy_coef self.use_adaptive_entropy = args.use_adaptive_entropy - if self.use_liger_kernel and self.entropy_coef != 0.0: + if self.use_liger_kernel and (self.entropy_coef != 0.0 or self.use_adaptive_entropy): raise NotImplementedError("Entropy bonus is not supported with Liger kernel.") # Datasets From 9d79e4a8e6896a5ad40b967e0ff8f62eb1c95e1c Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:49:03 +0000 Subject: [PATCH 10/37] Fix inconsistent 
inequality --- trl/trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 89d2b1e1044..b4f18fe59b7 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2766,7 +2766,7 @@ def _compute_loss(self, model, inputs): if self.use_adaptive_entropy: # Update the coefficient once per optimizer step, not per micro-batch if self.accelerator.sync_gradients: - if world_entropy < self.args.entropy_target: + if world_entropy <= self.args.entropy_target: self.entropy_coef = min( self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max ) From 46c8a64f6ba5ad156cbcbc24a1a2a7f2d4b89575 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:55:09 +0000 Subject: [PATCH 11/37] Fix mean reduction with sum-count-divide --- trl/trainer/grpo_trainer.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index b4f18fe59b7..b93369fc27e 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2758,11 +2758,13 @@ def _compute_loss(self, model, inputs): elif self.loss_type in ["cispo", "dapo", "vespo"]: entropy_loss = (entropies * mask).sum() / normalizer - # Mean per-token entropy in nats across ranks — computed independently of the loss - # normalizer so its scale matches entropy_target (loss-scaled entropy_loss would not). - world_entropy = self.accelerator.reduce( - ((entropies * mask).sum() / mask.sum().clamp(min=1.0)).detach(), reduction="mean" - ).item() + # True global mean per-token entropy (nats): reduce sum and token count jointly so + # that ranks with fewer tokens don't get equal weight (averaging per-rank means would + # be biased when completion lengths differ across ranks). 
+ entropy_stats = self.accelerator.reduce( + torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum" + ) + world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() if self.use_adaptive_entropy: # Update the coefficient once per optimizer step, not per micro-batch if self.accelerator.sync_gradients: From 3f7a6692080ae0f0b6e64804aa4ef8b8f64522ac Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:19:33 +0000 Subject: [PATCH 12/37] Set _last_world_entropy at init --- trl/trainer/grpo_trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index b93369fc27e..b1727d58d62 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -670,6 +670,9 @@ def __init__( ) self.entropy_coef = args.entropy_coef self.use_adaptive_entropy = args.use_adaptive_entropy + # Cached entropy from the last optimizer step; inf so the first accumulation window + # applies no bonus until a real measurement arrives (conservative default). 
+ self._last_world_entropy = float("inf") if self.use_liger_kernel and (self.entropy_coef != 0.0 or self.use_adaptive_entropy): raise NotImplementedError("Entropy bonus is not supported with Liger kernel.") From a05c97907367a3f0e7754d6b0189b57f1183b1be Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:21:15 +0000 Subject: [PATCH 13/37] Cache world_entropy at sync point and use that cached value for apply_coef --- trl/trainer/grpo_trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index b1727d58d62..0d00f40257b 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2769,7 +2769,10 @@ def _compute_loss(self, model, inputs): ) world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() if self.use_adaptive_entropy: - # Update the coefficient once per optimizer step, not per micro-batch + # Update coefficient and cache entropy once per optimizer step, not per micro-batch. + # apply_coef uses the cached value so all micro-batches within one accumulation + # window apply the same bonus (using per-micro-batch world_entropy would cause + # the bonus to toggle on/off unpredictably across accumulation steps). 
if self.accelerator.sync_gradients: if world_entropy <= self.args.entropy_target: self.entropy_coef = min( @@ -2779,7 +2782,8 @@ def _compute_loss(self, model, inputs): self.entropy_coef = max( self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min ) - apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0 + self._last_world_entropy = world_entropy + apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0 else: apply_coef = self.entropy_coef From fe03dd1037fb4c79e25ae2bbe9dee686db17ccad Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:30:58 +0000 Subject: [PATCH 14/37] Persist also _last_world_entropy --- trl/trainer/grpo_trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 0d00f40257b..7ace9470a58 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2947,7 +2947,7 @@ def _save_checkpoint(self, model, trial): checkpoint_folder = f"checkpoint-{self.state.global_step}" output_dir = os.path.join(self.args.output_dir, checkpoint_folder) with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f: - json.dump({"entropy_coef": self.entropy_coef}, f) + json.dump({"entropy_coef": self.entropy_coef, "last_world_entropy": self._last_world_entropy}, f) def _load_optimizer_and_scheduler(self, checkpoint): super()._load_optimizer_and_scheduler(checkpoint) @@ -2955,4 +2955,6 @@ def _load_optimizer_and_scheduler(self, checkpoint): path = os.path.join(checkpoint, "entropy_ctrl_state.json") if os.path.exists(path): with open(path) as f: - self.entropy_coef = json.load(f)["entropy_coef"] + state = json.load(f) + self.entropy_coef = state["entropy_coef"] + self._last_world_entropy = state.get("last_world_entropy", float("inf")) From f099349fa7d7d2fc5a74868b6f56e1b70b33d9ed Mon 
Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:31:24 +0000 Subject: [PATCH 15/37] Add paper_index entry --- docs/source/paper_index.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md index e8491893389..ea85a7b8594 100644 --- a/docs/source/paper_index.md +++ b/docs/source/paper_index.md @@ -225,6 +225,27 @@ training_args = GRPOConfig( ) ``` +### Skywork-OR1: Open Reasoning Models + +**📜 Paper**: https://huggingface.co/papers/2505.22312 + +Skywork-OR1 is a family of open reasoning models trained with GRPO. The paper introduces **adaptive entropy control**: an entropy regularization term `−α·H(π_θ)` is added to the GRPO objective, and the coefficient `α` is automatically adjusted each optimizer step. When the model's mean per-token entropy falls at or below a target, `α` is incremented to encourage more exploration; otherwise it is decremented. The bonus is only applied while entropy is at or below the target. 
To replicate this adaptive entropy control, use the following configuration: + +```python +from trl import GRPOConfig, GRPOTrainer + +training_args = GRPOConfig( + use_adaptive_entropy=True, # enable adaptive entropy control (Section 3.3 of the paper) + entropy_coef=0.01, # initial entropy regularization coefficient + entropy_target=5.0, # target mean per-token entropy (nats); tune for your model + entropy_coef_delta=0.005, # step size for coefficient updates per optimizer step +) +trainer = GRPOTrainer( + ..., + args=training_args, +) +``` + ### Beyond the 80/20 Rule: High-Entropy Minority Tokens Drive Effective Reinforcement Learning for LLM Reasoning **📜 Paper**: https://huggingface.co/papers/2506.01939 From 5288cd5e2987a0952e17d9760346e66715dec537 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 13:18:06 +0000 Subject: [PATCH 16/37] Capture the pure policy loss before normalization --- trl/trainer/grpo_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 7ace9470a58..9c0c34a46a4 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2724,29 +2724,31 @@ def _compute_loss(self, model, inputs): if self.loss_type in ["grpo", "sapo"]: loss = ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval + policy_loss = loss.detach() loss = loss / normalizer elif self.loss_type == "bnpo": loss = (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0) normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval + policy_loss = loss.detach() loss = loss / normalizer elif self.loss_type == "dr_grpo": loss = (per_token_loss * mask).sum() / (per_token_loss.size(0) * self.max_completion_length) normalizer = 
self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval + policy_loss = loss.detach() loss = loss / normalizer elif self.loss_type in ["cispo", "dapo", "vespo"]: normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes loss = (per_token_loss * mask).sum() / normalizer + policy_loss = loss.detach() elif self.loss_type == "luspo": # Unless importance_sampling_level="token" (not recommended here), per_token_loss is expected to be (B, 1) loss = (per_token_loss * mask.sum(1, keepdim=True)).mean() normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 + policy_loss = loss.detach() loss = loss / normalizer else: raise ValueError(f"Unknown loss type: {self.loss_type}") - # Capture the pure policy loss for logging before entropy/aux modify it - policy_loss = loss.detach() - # Entropy bonus: add entropy regularization to encourage exploration. # Gate: run whenever a non-zero static coef is set OR adaptive mode is enabled. Adaptive must always run even # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy From 03f4208c02b3ae854ef8378360d20b340590d9d9 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 13:18:28 +0000 Subject: [PATCH 17/37] Fix luspo loss --- trl/trainer/grpo_trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 9c0c34a46a4..5c06e0bc368 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2754,7 +2754,7 @@ def _compute_loss(self, model, inputs): # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy # drops below entropy_target again. 
if self.entropy_coef != 0.0 or self.use_adaptive_entropy: - if self.loss_type in ["grpo", "sapo", "luspo"]: + if self.loss_type in ["grpo", "sapo"]: entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer elif self.loss_type == "bnpo": entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer @@ -2762,6 +2762,9 @@ def _compute_loss(self, model, inputs): entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer elif self.loss_type in ["cispo", "dapo", "vespo"]: entropy_loss = (entropies * mask).sum() / normalizer + elif self.loss_type == "luspo": + # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence + entropy_loss = (entropies * mask).sum(-1).mean() / normalizer # True global mean per-token entropy (nats): reduce sum and token count jointly so # that ranks with fewer tokens don't get equal weight (averaging per-rank means would From dbc0c7592c6437d0c45b2cdc78043e90c7cdc75d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 13:19:43 +0000 Subject: [PATCH 18/37] Gate policy_loss logging and align style --- trl/trainer/grpo_trainer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 5c06e0bc368..db4926a71ec 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2794,6 +2794,10 @@ def _compute_loss(self, model, inputs): loss = loss - apply_coef * entropy_loss + self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item()) + self._metrics[mode]["entropy_loss"].append(world_entropy) + self._metrics[mode]["entropy_coef"].append(self.entropy_coef) + # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too if self.aux_loss_enabled: 
normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 @@ -2809,11 +2813,6 @@ def masked_batch_mean(x): else: return (x * mask).sum() / completion_token_count - self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(policy_loss, reduction="mean").item()) - if self.entropy_coef != 0.0 or self.use_adaptive_entropy: - self._metrics[mode]["entropy_loss"].append(world_entropy) - self._metrics[mode]["entropy_coef"].append(self.entropy_coef) - if self.beta != 0.0: mean_kl = masked_batch_mean(per_token_kl) self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item()) From 506fbf9f4cd8b838b90c6ed6982ae432c7baf70c Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:01:00 +0000 Subject: [PATCH 19/37] Fix entropy state written to wrong path --- trl/trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index aeef9676067..1576425522d 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2959,7 +2959,7 @@ def _save_checkpoint(self, model, trial): super()._save_checkpoint(model, trial) if self.use_adaptive_entropy and self.is_world_process_zero(): checkpoint_folder = f"checkpoint-{self.state.global_step}" - output_dir = os.path.join(self.args.output_dir, checkpoint_folder) + output_dir = os.path.join(self._get_output_dir(trial=trial), checkpoint_folder) with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f: json.dump({"entropy_coef": self.entropy_coef, "last_world_entropy": self._last_world_entropy}, f) From 8a6b53dde71d9340a2491bbc83744fca9e35484d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:01:43 +0000 Subject: [PATCH 20/37] Fix is_world_process_zero() vs args.should_save guard mismatch --- 
trl/trainer/grpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 1576425522d..e6f60c1831f 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2957,7 +2957,7 @@ def _save_checkpoint(self, model, trial): model_name = self.args.hub_model_id.split("/")[-1] self.create_model_card(model_name=model_name) super()._save_checkpoint(model, trial) - if self.use_adaptive_entropy and self.is_world_process_zero(): + if self.use_adaptive_entropy and self.args.should_save: checkpoint_folder = f"checkpoint-{self.state.global_step}" output_dir = os.path.join(self._get_output_dir(trial=trial), checkpoint_folder) with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f: From 474b30c479c795403a041439e781c66d9974e345 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:03:03 +0000 Subject: [PATCH 21/37] Update docs: policy_loss only logged inside entropy block --- docs/source/grpo_trainer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index 7f19eb03cc1..bdd366ee3d5 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -185,7 +185,7 @@ While training and evaluating, we record the following reward metrics: - `reward`: The overall average reward after summing rewards across functions (weighted by `reward_weights`). - `reward_std`: The standard deviation of summed rewards across functions (weighted by `reward_weights`), computed over the full batch. - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect). -- `policy_loss`: The policy gradient loss value (before any entropy bonus). 
+- `policy_loss`: The policy gradient loss value (before any entropy bonus). Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.) - `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. - `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`. From a0b9ec68aa11fe3399dc503796cc4296d3240b8e Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:04:37 +0000 Subject: [PATCH 22/37] Log entropy_coef only when sync_gradients=True --- trl/trainer/grpo_trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index e6f60c1831f..03bbbdda207 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2806,7 +2806,10 @@ def _compute_loss(self, model, inputs): self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item()) self._metrics[mode]["entropy_loss"].append(world_entropy) - self._metrics[mode]["entropy_coef"].append(self.entropy_coef) + # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients), + # so logging K identical values per step would dilute the metric with stale data. 
+ if self.accelerator.sync_gradients: + self._metrics[mode]["entropy_coef"].append(self.entropy_coef) # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too if self.aux_loss_enabled: From 608b1e0a8772697eebf4b442dbebddd1cdde80eb Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:06:30 +0000 Subject: [PATCH 23/37] Add guard for entropy-loss dispatch matching policy-loss dispatch --- trl/trainer/grpo_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 03bbbdda207..387eacd15fb 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2775,6 +2775,8 @@ def _compute_loss(self, model, inputs): elif self.loss_type == "luspo": # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence entropy_loss = (entropies * mask).sum(-1).mean() / normalizer + else: + raise ValueError(f"Unknown loss type: {self.loss_type}") # True global mean per-token entropy (nats): reduce sum and token count jointly so # that ranks with fewer tokens don't get equal weight (averaging per-rank means would From 81841ad99cb4e0f94ddc2fbffaf5b66286a674f8 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:18:38 +0000 Subject: [PATCH 24/37] Remove entropy_loss --- docs/source/grpo_trainer.md | 3 +-- tests/test_grpo_trainer.py | 2 -- trl/trainer/grpo_trainer.py | 20 ++++++++------------ 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index bdd366ee3d5..e6fb82f1f88 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -187,8 +187,7 @@ While training and evaluating, we record the following reward metrics: - `frac_reward_zero_std`: The fraction of samples 
in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect). - `policy_loss`: The policy gradient loss value (before any entropy bonus). Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.) -- `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. -- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`. +- `entropy_coef`: The current entropy regularization coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`. - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero. - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change. - `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\). 
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index 186a815d2eb..29f07aff5ff 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -1498,7 +1498,6 @@ def test_train_with_static_entropy(self): assert trainer.state.log_history[-1]["train_loss"] is not None assert trainer.state.log_history[-1]["policy_loss"] is not None - assert trainer.state.log_history[-1]["entropy_loss"] is not None assert trainer.state.log_history[-1]["entropy_coef"] is not None # Check that the params have changed @@ -1532,7 +1531,6 @@ def test_train_with_adaptive_entropy(self): assert trainer.state.log_history[-1]["train_loss"] is not None assert trainer.state.log_history[-1]["policy_loss"] is not None - assert trainer.state.log_history[-1]["entropy_loss"] is not None assert trainer.state.log_history[-1]["entropy_coef"] is not None # Coefficient should have increased since entropy < target throughout training assert trainer.entropy_coef > 0.01 diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 387eacd15fb..21fadcc16b6 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2778,18 +2778,15 @@ def _compute_loss(self, model, inputs): else: raise ValueError(f"Unknown loss type: {self.loss_type}") - # True global mean per-token entropy (nats): reduce sum and token count jointly so - # that ranks with fewer tokens don't get equal weight (averaging per-rank means would - # be biased when completion lengths differ across ranks). - entropy_stats = self.accelerator.reduce( - torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum" - ) - world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() if self.use_adaptive_entropy: - # Update coefficient and cache entropy once per optimizer step, not per micro-batch. 
- # apply_coef uses the cached value so all micro-batches within one accumulation - # window apply the same bonus (using per-micro-batch world_entropy would cause - # the bonus to toggle on/off unpredictably across accumulation steps). + # Reduce sum and token count jointly for a true global mean (unbiased when ranks + # have different completion lengths). Update coefficient and cache entropy once per + # optimizer step; apply_coef uses the cached value so all micro-batches within one + # accumulation window apply the same bonus. + entropy_stats = self.accelerator.reduce( + torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum" + ) + world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() if self.accelerator.sync_gradients: if world_entropy <= self.args.entropy_target: self.entropy_coef = min( @@ -2807,7 +2804,6 @@ def _compute_loss(self, model, inputs): loss = loss - apply_coef * entropy_loss self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item()) - self._metrics[mode]["entropy_loss"].append(world_entropy) # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients), # so logging K identical values per step would dilute the metric with stale data. 
if self.accelerator.sync_gradients: From bee5126842614b1d3fbdf0df01df498d95ca5c3b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:22:53 +0000 Subject: [PATCH 25/37] Gate on train mode to avoid entropy state update during eval --- trl/trainer/grpo_trainer.py | 44 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 21fadcc16b6..bebb43a4481 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2779,24 +2779,26 @@ def _compute_loss(self, model, inputs): raise ValueError(f"Unknown loss type: {self.loss_type}") if self.use_adaptive_entropy: - # Reduce sum and token count jointly for a true global mean (unbiased when ranks - # have different completion lengths). Update coefficient and cache entropy once per - # optimizer step; apply_coef uses the cached value so all micro-batches within one - # accumulation window apply the same bonus. - entropy_stats = self.accelerator.reduce( - torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum" - ) - world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() - if self.accelerator.sync_gradients: - if world_entropy <= self.args.entropy_target: - self.entropy_coef = min( - self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max - ) - else: - self.entropy_coef = max( - self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min - ) - self._last_world_entropy = world_entropy + if mode == "train": + # Reduce sum and token count jointly for a true global mean (unbiased when ranks + # have different completion lengths). Update coefficient and cache entropy once per + # optimizer step; apply_coef uses the cached value so all micro-batches within one + # accumulation window apply the same bonus. 
Gated on train mode so evaluation + # cannot mutate the entropy controller state. + entropy_stats = self.accelerator.reduce( + torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum" + ) + world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() + if self.accelerator.sync_gradients: + if world_entropy <= self.args.entropy_target: + self.entropy_coef = min( + self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max + ) + else: + self.entropy_coef = max( + self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min + ) + self._last_world_entropy = world_entropy apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0 else: apply_coef = self.entropy_coef @@ -2804,9 +2806,9 @@ def _compute_loss(self, model, inputs): loss = loss - apply_coef * entropy_loss self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item()) - # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients), - # so logging K identical values per step would dilute the metric with stale data. - if self.accelerator.sync_gradients: + # Log entropy_coef only on train optimizer-step boundaries: it updates once per step + # (sync_gradients), and sync_gradients is always True in eval (no accumulation context). 
+ if mode == "train" and self.accelerator.sync_gradients: self._metrics[mode]["entropy_coef"].append(self.entropy_coef) # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too From 2f34d156c5cc069fa5a2b7d58b1ce7b6b0c079b8 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:57:02 +0000 Subject: [PATCH 26/37] Fix entropy bonus ignores quantile mask --- trl/trainer/grpo_trainer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index bebb43a4481..9b1a5a1c79e 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2764,17 +2764,22 @@ def _compute_loss(self, model, inputs): # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy # drops below entropy_target again. if self.entropy_coef != 0.0 or self.use_adaptive_entropy: + # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy + # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens. 
+ effective_mask = mask if entropy_mask is None else mask * entropy_mask if self.loss_type in ["grpo", "sapo"]: - entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer + entropy_loss = ((entropies * effective_mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer elif self.loss_type == "bnpo": - entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer + entropy_loss = (entropies * effective_mask).sum() / mask.sum().clamp(min=1.0) / normalizer elif self.loss_type == "dr_grpo": - entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer + entropy_loss = ( + (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer + ) elif self.loss_type in ["cispo", "dapo", "vespo"]: - entropy_loss = (entropies * mask).sum() / normalizer + entropy_loss = (entropies * effective_mask).sum() / normalizer elif self.loss_type == "luspo": # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence - entropy_loss = (entropies * mask).sum(-1).mean() / normalizer + entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer else: raise ValueError(f"Unknown loss type: {self.loss_type}") From 806078dda8370ee5479fcb5d50730ed3c5c826a4 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 15:05:48 +0000 Subject: [PATCH 27/37] Use effective_mask for the world_entropy all-reduce too --- trl/trainer/grpo_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 9b1a5a1c79e..8107d5827b8 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2791,7 +2791,8 @@ def _compute_loss(self, model, inputs): # accumulation window apply the same bonus. 
Gated on train mode so evaluation # cannot mutate the entropy controller state. entropy_stats = self.accelerator.reduce( - torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum" + torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(), + reduction="sum", ) world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() if self.accelerator.sync_gradients: From 2845ef455f3132a1caad88dc0bb6bca0a799f147 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 15:06:11 +0000 Subject: [PATCH 28/37] Update docs --- docs/source/grpo_trainer.md | 2 +- trl/trainer/grpo_config.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index e6fb82f1f88..0cb882a6846 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -677,7 +677,7 @@ training_args = GRPOConfig( -Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). +Typical language models have per-token entropies of 2–10 nats. Because the bonus is only applied while entropy is at or below `entropy_target`, the default `entropy_target=0.2` almost never triggers regularization for such models; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly.
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py index 230c8f50c10..a83dd7c0349 100644 --- a/trl/trainer/grpo_config.py +++ b/trl/trainer/grpo_config.py @@ -314,9 +314,11 @@ class GRPOConfig(_BaseConfig): Step size for adjusting the entropy coefficient at each optimizer step during adaptive entropy control. entropy_target (`float`, *optional*, defaults to `0.2`): Target mean per-token entropy (in nats) used by adaptive entropy control. The coefficient is only - applied when the current entropy falls at or below this value. Typical language models have per-token - entropies in the range 2–10 nats; the default of `0.2` nearly always triggers regularization, so users - should tune this to a value appropriate for their model and task. + applied when the current entropy falls at or below this value. Measured over the same token set as + the policy loss: all completion tokens by default, or only the high-entropy subset when + `top_entropy_quantile < 1.0`. Typical language models have per-token entropies in the range 2–10 + nats; the default of `0.2` almost never triggers regularization, so users should tune this to a + value appropriate for their model and task (and token subset when using `top_entropy_quantile`). max_tool_calling_iterations (`int`, *optional*): Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation stops when the model generates a response turn with no tool calls or when the total response length reaches @@ -886,8 +888,10 @@ class GRPOConfig(_BaseConfig): default=0.2, metadata={ "help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only " - "applied when current entropy is at or below this value. Typical language models have per-token " - "entropies of 2–10 nats; the default of 0.2 nearly always triggers regularization, so tune this." + "applied when current entropy is at or below this value.
Measured over the same token set as the " + "policy loss (all completion tokens, or the high-entropy subset when top_entropy_quantile < 1.0). " + "Typical language models have per-token entropies of 2–10 nats; the default of 0.2 almost never " + "triggers regularization, so tune this." }, ) max_tool_calling_iterations: int | None = field( From 2ed11c0a7d9f43d65364dd1b20a05a66f0c4d87b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 24 Jun 2026 15:25:11 +0000 Subject: [PATCH 29/37] Use unified formula with mean per-token entropy of active tokens --- trl/trainer/grpo_trainer.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 8107d5827b8..305fcbb375e 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2767,21 +2767,10 @@ def _compute_loss(self, model, inputs): # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
effective_mask = mask if entropy_mask is None else mask * entropy_mask - if self.loss_type in ["grpo", "sapo"]: - entropy_loss = ((entropies * effective_mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer - elif self.loss_type == "bnpo": - entropy_loss = (entropies * effective_mask).sum() / mask.sum().clamp(min=1.0) / normalizer - elif self.loss_type == "dr_grpo": - entropy_loss = ( - (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer - ) - elif self.loss_type in ["cispo", "dapo", "vespo"]: - entropy_loss = (entropies * effective_mask).sum() / normalizer - elif self.loss_type == "luspo": - # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence - entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer - else: - raise ValueError(f"Unknown loss type: {self.loss_type}") + # Mean per-active-token entropy, scaled for gradient accumulation like the policy loss. + # Uniform across all loss types so entropy_target and entropy_coef have consistent units + # (per-token nats) and match the world_entropy computed in the adaptive block below. 
+ entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer if self.use_adaptive_entropy: if mode == "train": From 76255d3016e79ceace966c6bf50e6e3ff60b448f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 25 Jun 2026 14:52:43 +0200 Subject: [PATCH 30/37] Make three-branch entropy-loss split --- trl/trainer/grpo_trainer.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 50e8337e77b..280ea1eb446 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2767,10 +2767,21 @@ def _compute_loss(self, model, inputs): # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens. effective_mask = mask if entropy_mask is None else mask * entropy_mask - # Mean per-active-token entropy, scaled for gradient accumulation like the policy loss. - # Uniform across all loss types so entropy_target and entropy_coef have consistent units - # (per-token nats) and match the world_entropy computed in the adaptive block below. - entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer + # The entropy bonus must be normalized exactly like each loss type's policy loss, so that + # entropy_coef stays on a consistent scale and gradient accumulation remains correct. The + # normalizer differs by loss type: it is the gradient accumulation step count for the grpo + # family, but a global token count for the cispo/dapo/vespo family. + if self.loss_type in ["cispo", "dapo", "vespo"]: + # normalizer is a global token count, so summing the entropies accumulates over the + # optimizer step to the global token-weighted mean entropy, matching world_entropy below. 
+ entropy_loss = (entropies * effective_mask).sum() / normalizer + elif self.loss_type == "luspo": + # luspo weights each sequence by its token count, so entropy is summed per sequence (not + # per-token averaged) to stay on the same scale as the policy loss. + entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer + else: # grpo, sapo, bnpo, dr_grpo: normalizer is the gradient accumulation step count + # Token-weighted mean entropy of active tokens, matching world_entropy below. + entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer if self.use_adaptive_entropy: if mode == "train": From fc76d4b497dab5310be5b222945162851041fb18 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 25 Jun 2026 14:53:49 +0200 Subject: [PATCH 31/37] Compute bonus from frozen state, update per optimizer step --- trl/trainer/grpo_trainer.py | 51 ++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 280ea1eb446..886e9064a87 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2783,28 +2783,10 @@ def _compute_loss(self, model, inputs): # Token-weighted mean entropy of active tokens, matching world_entropy below. entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer + # Apply the coefficient and gating from the end of the previous optimizer step, so that every + # micro-batch in the current accumulation window applies the same entropy bonus. The adaptive + # update below only takes effect on the next step. if self.use_adaptive_entropy: - if mode == "train": - # Reduce sum and token count jointly for a true global mean (unbiased when ranks - # have different completion lengths). 
Update coefficient and cache entropy once per - # optimizer step; apply_coef uses the cached value so all micro-batches within one - # accumulation window apply the same bonus. Gated on train mode so evaluation - # cannot mutate the entropy controller state. - entropy_stats = self.accelerator.reduce( - torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(), - reduction="sum", - ) - world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() - if self.accelerator.sync_gradients: - if world_entropy <= self.args.entropy_target: - self.entropy_coef = min( - self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max - ) - else: - self.entropy_coef = max( - self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min - ) - self._last_world_entropy = world_entropy apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0 else: apply_coef = self.entropy_coef @@ -2812,8 +2794,31 @@ def _compute_loss(self, model, inputs): loss = loss - apply_coef * entropy_loss self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item()) - # Log entropy_coef only on train optimizer-step boundaries: it updates once per step - # (sync_gradients), and sync_gradients is always True in eval (no accumulation context). + + # Adaptive update: once per optimizer step, measure the token-weighted entropy across all ranks and + # adjust the coefficient for the next step. Gated on train mode so evaluation cannot mutate the + # entropy controller state, and on sync_gradients so the all-reduce runs once per optimizer step, i.e. + # only the micro-batch that closes the accumulation window is measured, not every micro-batch. + if self.use_adaptive_entropy and mode == "train" and self.accelerator.sync_gradients: + # Reduce sum and token count jointly for a true global mean (unbiased when ranks have + # different completion lengths). 
+ entropy_stats = self.accelerator.reduce( + torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(), + reduction="sum", + ) + world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item() + if world_entropy <= self.args.entropy_target: + self.entropy_coef = min( + self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max + ) + else: + self.entropy_coef = max( + self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min + ) + self._last_world_entropy = world_entropy + + # Log entropy_coef on train optimizer-step boundaries (constant for static control; updated just + # above for adaptive control). sync_gradients is always True in eval (no accumulation context). if mode == "train" and self.accelerator.sync_gradients: self._metrics[mode]["entropy_coef"].append(self.entropy_coef) From bed5188839e63bfac5deefd74b7fb39eced5f770 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 25 Jun 2026 15:00:44 +0200 Subject: [PATCH 32/37] Fix "nearly always triggers" docs --- docs/source/grpo_trainer.md | 2 +- trl/trainer/grpo_config.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index 0cb882a6846..c05cbd389b7 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -677,7 +677,7 @@ training_args = GRPOConfig( -Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly. 
+Typical language models have per-token entropies of 2–10 nats, so the default `entropy_target=0.2` almost never triggers regularization — the bonus only engages once entropy is at or below the target, i.e. near-complete collapse. Set it to a value meaningful for your model, e.g. close to the entropy you observe early in training (logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly. diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py index a83dd7c0349..5c469aecfdc 100644 --- a/trl/trainer/grpo_config.py +++ b/trl/trainer/grpo_config.py @@ -317,8 +317,10 @@ class GRPOConfig(_BaseConfig): applied when the current entropy falls at or below this value. Measured over the same token set as the policy loss: all completion tokens by default, or only the high-entropy subset when `top_entropy_quantile < 1.0`. Typical language models have per-token entropies in the range 2–10 - nats; the default of `0.2` nearly always triggers regularization, so users should tune this to a - value appropriate for their model and task (and token subset when using `top_entropy_quantile`). + nats, so the default of `0.2` almost never triggers regularization (only on near-complete entropy + collapse); set it close to the entropy you observe early in training (logged as the `entropy` + metric) so the bonus engages before the policy collapses (and account for the token subset when + using `top_entropy_quantile`). max_tool_calling_iterations (`int`, *optional*): Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation stops when the model generates a response turn with no tool calls or when the total response length reaches @@ -890,8 +892,9 @@ class GRPOConfig(_BaseConfig): "help": "Target mean per-token entropy (nats) for adaptive entropy control. 
The coefficient is only " "applied when current entropy is at or below this value. Measured over the same token set as the " "policy loss (all completion tokens, or the high-entropy subset when top_entropy_quantile < 1.0). " - "Typical language models have per-token entropies of 2–10 nats; the default of 0.2 nearly always " - "triggers regularization, so tune this." + "Typical language models have per-token entropies of 2–10 nats, so the default of 0.2 almost never " + "triggers regularization (only on near-complete collapse); set it close to the entropy observed " + "early in training and tune from there." }, ) max_tool_calling_iterations: int | None = field( From 6e8f498ed10927d40131bfd5e6d4be587bdc23bd Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 25 Jun 2026 15:02:28 +0200 Subject: [PATCH 33/37] Add scale test and grad-accumulation adaptive test --- tests/test_grpo_trainer.py | 87 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index 29f07aff5ff..8c816bcb33a 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -1540,6 +1540,93 @@ def test_train_with_adaptive_entropy(self): new_param = trainer.model.get_parameter(n) assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + @pytest.mark.parametrize("loss_type", ["grpo", "dapo", "luspo"]) + def test_entropy_bonus_scale(self, loss_type): + # Regression test: the entropy bonus must be normalized like each loss type's policy loss. A previous + # "unified" formula divided the per-token mean entropy by the loss normalizer, which for the + # cispo/dapo/vespo family is a global token count, making the bonus ~1/sequence_length too small; and + # it put luspo's bonus on the per-token scale instead of luspo's sequence-weighted scale. 
With + # gradient_accumulation_steps=1 the per-step entropy contribution to the loss is + # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy reveals the scale. + entropy_coef = 0.5 + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + training_args = GRPOConfig( + output_dir=self.tmp_dir, + importance_sampling_level="sequence" if loss_type == "luspo" else "token", + learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=16, # long enough that the per-token vs sequence-weighted scales differ + gradient_accumulation_steps=1, # so contrib == entropy_coef * entropy_loss holds per step + loss_type=loss_type, + logging_steps=1, + report_to="none", + entropy_coef=entropy_coef, + ) + trainer = GRPOTrainer( + model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=dataset, + ) + + trainer.train() + + logs = [h for h in trainer.state.log_history if "policy_loss" in h and "loss" in h and h.get("entropy")] + assert logs + ratios = sorted((h["policy_loss"] - h["loss"]) / h["entropy"] for h in logs) + ratio = ratios[len(ratios) // 2] # median, robust to per-step noise + if loss_type == "luspo": + # luspo weights each sequence by its length, so the bonus is the per-sequence entropy sum: its + # scale is entropy_coef * (mean sequence length), well above entropy_coef. The buggy per-token + # formula gave ratio == entropy_coef. + assert ratio > 1.5 * entropy_coef + else: + # grpo (and the cispo/dapo/vespo family) regularize the per-token mean entropy, so the bonus is + # exactly entropy_coef * entropy. The buggy formula made dapo's ratio smaller by ~1/seq_len. 
+ assert ratio == pytest.approx(entropy_coef, rel=0.3) + + def test_train_with_adaptive_entropy_gradient_accumulation(self): + # Adaptive entropy must behave correctly under gradient accumulation: the coefficient and gating are + # frozen across an accumulation window and the controller updates once per optimizer step (not once + # per micro-batch). With entropy_target above any realistic entropy the coefficient is incremented by + # entropy_coef_delta on every optimizer step, so the final value pins down the number of updates. + dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") + training_args = GRPOConfig( + output_dir=self.tmp_dir, + learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates + per_device_train_batch_size=3, # reduce the batch size to reduce memory usage + num_generations=3, # reduce the number of generations to reduce memory usage + max_completion_length=8, # reduce the completion length to reduce memory usage + gradient_accumulation_steps=2, # exercise the accumulation window + report_to="none", + entropy_coef=0.01, + use_adaptive_entropy=True, + entropy_target=15.0, # above any realistic entropy → coef incremented once per optimizer step + entropy_coef_delta=0.005, + ) + trainer = GRPOTrainer( + model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5", + args=training_args, + train_dataset=dataset, + ) + + previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()} + + trainer.train() + + assert trainer.state.log_history[-1]["train_loss"] is not None + # Exactly one increment per optimizer step (global_step counts optimizer steps, not micro-batches); + # a per-micro-batch update would overshoot this. 
+ expected_coef = min(0.01 + 0.005 * trainer.state.global_step, 1.0) + assert trainer.entropy_coef == pytest.approx(expected_coef, abs=1e-6) + + # Check that the params have changed + for n, param in previous_trainable_params.items(): + new_param = trainer.model.get_parameter(n) + assert not torch.equal(param, new_param), f"Parameter {n} has not changed." + def test_train_with_entropy_filter(self): dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") training_args = GRPOConfig( From 607d911d95cfcdd631f8164fb307b4ca433a7a6c Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 25 Jun 2026 15:47:33 +0200 Subject: [PATCH 34/37] Fix dr_grpo entropy scale mismatch --- trl/trainer/grpo_trainer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 886e9064a87..c2e4b211c8c 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2775,11 +2775,18 @@ def _compute_loss(self, model, inputs): # normalizer is a global token count, so summing the entropies accumulates over the # optimizer step to the global token-weighted mean entropy, matching world_entropy below. entropy_loss = (entropies * effective_mask).sum() / normalizer + elif self.loss_type == "dr_grpo": + # Dr. GRPO normalizes by the fixed budget (batch size × max completion length) instead of the + # actual token count, to remove length bias; scale the entropy bonus the same way so that + # entropy_coef stays consistent with the policy term when completions are shorter than the max. + entropy_loss = ( + (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer + ) elif self.loss_type == "luspo": # luspo weights each sequence by its token count, so entropy is summed per sequence (not # per-token averaged) to stay on the same scale as the policy loss. 
entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer - else: # grpo, sapo, bnpo, dr_grpo: normalizer is the gradient accumulation step count + else: # grpo, sapo, bnpo: normalizer is the gradient accumulation step count # Token-weighted mean entropy of active tokens, matching world_entropy below. entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer From 0cfad37a15b4eeb45a888f7b428b74999c4c934d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 25 Jun 2026 16:07:55 +0200 Subject: [PATCH 35/37] Accumulate to mean per-token entropy, independent of how each loss type normalizes --- trl/trainer/grpo_trainer.py | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index c2e4b211c8c..ead9707d764 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -2767,27 +2767,18 @@ def _compute_loss(self, model, inputs): # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens. effective_mask = mask if entropy_mask is None else mask * entropy_mask - # The entropy bonus must be normalized exactly like each loss type's policy loss, so that - # entropy_coef stays on a consistent scale and gradient accumulation remains correct. The - # normalizer differs by loss type: it is the gradient accumulation step count for the grpo - # family, but a global token count for the cispo/dapo/vespo family. + # Entropy bonus = mean per-token entropy H (the documented objective L = L_policy - coef * H), so + # H does not depend on how each loss type normalizes its policy term. The term is computed so that + # it accumulates to H over the optimizer step for every loss type and matches world_entropy below. 
+ # The only wrinkle is the normalizer: most loss types divide by the gradient accumulation step + # count, but cispo/dapo/vespo divide by a global token count. if self.loss_type in ["cispo", "dapo", "vespo"]: - # normalizer is a global token count, so summing the entropies accumulates over the - # optimizer step to the global token-weighted mean entropy, matching world_entropy below. + # normalizer is a global token count, so summing the entropies (instead of averaging them + # again) makes the term accumulate over the optimizer step to the global mean per-token + # entropy, like the other loss types. entropy_loss = (entropies * effective_mask).sum() / normalizer - elif self.loss_type == "dr_grpo": - # Dr. GRPO normalizes by the fixed budget (batch size × max completion length) instead of the - # actual token count, to remove length bias; scale the entropy bonus the same way so that - # entropy_coef stays consistent with the policy term when completions are shorter than the max. - entropy_loss = ( - (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer - ) - elif self.loss_type == "luspo": - # luspo weights each sequence by its token count, so entropy is summed per sequence (not - # per-token averaged) to stay on the same scale as the policy loss. - entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer - else: # grpo, sapo, bnpo: normalizer is the gradient accumulation step count - # Token-weighted mean entropy of active tokens, matching world_entropy below. + else: + # Mean per-token entropy of active tokens, scaled for gradient accumulation. 
entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer # Apply the coefficient and gating from the end of the previous optimizer step, so that every From 8e05132281547ac1222936c68479528176fcaa3f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 25 Jun 2026 16:08:14 +0200 Subject: [PATCH 36/37] Update tests --- tests/test_grpo_trainer.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py index 8c816bcb33a..875ee9b3b78 100644 --- a/tests/test_grpo_trainer.py +++ b/tests/test_grpo_trainer.py @@ -1540,14 +1540,16 @@ def test_train_with_adaptive_entropy(self): new_param = trainer.model.get_parameter(n) assert not torch.equal(param, new_param), f"Parameter {n} has not changed." - @pytest.mark.parametrize("loss_type", ["grpo", "dapo", "luspo"]) + @pytest.mark.parametrize("loss_type", ["grpo", "dr_grpo", "dapo", "luspo"]) def test_entropy_bonus_scale(self, loss_type): - # Regression test: the entropy bonus must be normalized like each loss type's policy loss. A previous - # "unified" formula divided the per-token mean entropy by the loss normalizer, which for the - # cispo/dapo/vespo family is a global token count, making the bonus ~1/sequence_length too small; and - # it put luspo's bonus on the per-token scale instead of luspo's sequence-weighted scale. With - # gradient_accumulation_steps=1 the per-step entropy contribution to the loss is - # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy reveals the scale. + # Regression test: the entropy bonus is the mean per-token entropy H for every loss type (documented + # objective L = L_policy - entropy_coef * H), so it must not inherit any loss-type-specific policy + # normalization. 
A previous "unified" formula divided H by a global token count for the + # cispo/dapo/vespo family, making the bonus ~1/sequence_length too small; conversely, scaling the + # bonus like the dr_grpo (fixed budget) or luspo (sequence-weighted) policy term would also be wrong. + # With gradient_accumulation_steps=1 the per-step entropy contribution to the loss is + # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy must equal + # entropy_coef for all loss types. entropy_coef = 0.5 dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") training_args = GRPOConfig( @@ -1556,7 +1558,7 @@ def test_entropy_bonus_scale(self, loss_type): learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates per_device_train_batch_size=3, # reduce the batch size to reduce memory usage num_generations=3, # reduce the number of generations to reduce memory usage - max_completion_length=16, # long enough that the per-token vs sequence-weighted scales differ + max_completion_length=16, # reduce the completion length to reduce memory usage gradient_accumulation_steps=1, # so contrib == entropy_coef * entropy_loss holds per step loss_type=loss_type, logging_steps=1, @@ -1576,15 +1578,8 @@ def test_entropy_bonus_scale(self, loss_type): assert logs ratios = sorted((h["policy_loss"] - h["loss"]) / h["entropy"] for h in logs) ratio = ratios[len(ratios) // 2] # median, robust to per-step noise - if loss_type == "luspo": - # luspo weights each sequence by its length, so the bonus is the per-sequence entropy sum: its - # scale is entropy_coef * (mean sequence length), well above entropy_coef. The buggy per-token - # formula gave ratio == entropy_coef. - assert ratio > 1.5 * entropy_coef - else: - # grpo (and the cispo/dapo/vespo family) regularize the per-token mean entropy, so the bonus is - # exactly entropy_coef * entropy. The buggy formula made dapo's ratio smaller by ~1/seq_len. 
- assert ratio == pytest.approx(entropy_coef, rel=0.3) + # Every loss type regularizes the mean per-token entropy, so contrib == entropy_coef * entropy. + assert ratio == pytest.approx(entropy_coef, rel=0.3) def test_train_with_adaptive_entropy_gradient_accumulation(self): # Adaptive entropy must behave correctly under gradient accumulation: the coefficient and gating are From bccd8ebbf274edf12ef295c15e0f0e1ae578b306 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 26 Jun 2026 07:23:05 +0200 Subject: [PATCH 37/37] Add clarifying sentence --- docs/source/grpo_trainer.md | 2 +- trl/trainer/grpo_config.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md index c05cbd389b7..b1e0050e1a6 100644 --- a/docs/source/grpo_trainer.md +++ b/docs/source/grpo_trainer.md @@ -651,7 +651,7 @@ $$ \mathcal{L}(\theta) = \mathcal{L}_{\text{GRPO}}(\theta) - \alpha \cdot \mathcal{H}(\pi_\theta), $$ -where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient. +where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient. The bonus is always the mean per-token entropy regardless of `loss_type`; it is not rescaled to match a loss type's policy normalization (e.g. Dr. GRPO's `batch_size * max_completion_length` denominator), so `entropy_coef` has the same meaning for every loss type. **Static entropy** — a fixed coefficient throughout training: diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py index 5c469aecfdc..46fce0e13a9 100644 --- a/trl/trainer/grpo_config.py +++ b/trl/trainer/grpo_config.py @@ -297,7 +297,9 @@ class GRPOConfig(_BaseConfig): `mask_truncated_completions=True`, only tokens from non-truncated completions are considered. 
entropy_coef (`float`, *optional*, defaults to `0.0`): Coefficient of the entropy regularization term in the loss. A positive value adds an entropy bonus that - encourages exploration by keeping the policy from collapsing to near-deterministic outputs. When + encourages exploration by keeping the policy from collapsing to near-deterministic outputs. The bonus is + always the mean per-token entropy regardless of `loss_type`; it is not rescaled to match a loss type's + policy normalization, so `entropy_coef` has the same meaning for every loss type. When `use_adaptive_entropy=True`, this serves as the initial coefficient and is updated each optimizer step. Has no effect when set to `0.0` (default). use_adaptive_entropy (`bool`, *optional*, defaults to `False`): @@ -862,7 +864,9 @@ class GRPOConfig(_BaseConfig): default=0.0, metadata={ "help": "Coefficient of the entropy regularization term in the loss. A positive value adds an entropy " - "bonus that encourages exploration. When `use_adaptive_entropy=True`, this serves as the initial " + "bonus that encourages exploration. The bonus is always the mean per-token entropy regardless of " + "`loss_type` (not rescaled to a loss type's policy normalization), so `entropy_coef` has the same " + "meaning for every loss type. When `use_adaptive_entropy=True`, this serves as the initial " "coefficient and is updated each optimizer step. Has no effect when set to `0.0` (default)." }, )