From ac50a1127573bee511abced1d0d0d0974e21dcfa Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:20:18 +0200
Subject: [PATCH 01/37] Add fields to GRPOConfig
---
trl/trainer/grpo_config.py | 58 ++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index 5736b6c0ddc..230c8f50c10 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -295,6 +295,28 @@ class GRPOConfig(_BaseConfig):
position, improving results. Range: `[0.0-1.0]`. A value of `0.0` masks all but the highest entropy token;
`1.0` keeps all tokens. The paper recommends a value of `0.2`. If used with
`mask_truncated_completions=True`, only tokens from non-truncated completions are considered.
+ entropy_coef (`float`, *optional*, defaults to `0.0`):
+ Coefficient of the entropy regularization term in the loss. A positive value adds an entropy bonus that
+ encourages exploration by keeping the policy from collapsing to near-deterministic outputs. When
+ `use_adaptive_entropy=True`, this serves as the initial coefficient and is updated each optimizer step.
+ Has no effect when set to `0.0` (default).
+ use_adaptive_entropy (`bool`, *optional*, defaults to `False`):
+ Whether to use adaptive entropy control, introduced in
+ [Skywork-OR1](https://huggingface.co/papers/2505.22312). When enabled, the entropy coefficient
+ `entropy_coef` is updated each optimizer step: incremented by `entropy_coef_delta` when the current
+ entropy is at or below `entropy_target`, and decremented otherwise. The coefficient is only applied when
+ entropy is at or below `entropy_target`.
+ entropy_coef_min (`float`, *optional*, defaults to `0.0`):
+ Lower bound for the entropy coefficient when using adaptive entropy control.
+ entropy_coef_max (`float`, *optional*, defaults to `1.0`):
+ Upper bound for the entropy coefficient when using adaptive entropy control.
+ entropy_coef_delta (`float`, *optional*, defaults to `0.005`):
+ Step size for adjusting the entropy coefficient at each optimizer step during adaptive entropy control.
+ entropy_target (`float`, *optional*, defaults to `0.2`):
+ Target mean per-token entropy (in nats) used by adaptive entropy control. The coefficient is only
+ applied when the current entropy falls at or below this value. Typical language models have per-token
+ entropies in the range 2–10 nats; the default of `0.2` almost never triggers regularization, so users
+ should tune this to a value appropriate for their model and task.
max_tool_calling_iterations (`int`, *optional*):
Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation
stops when the model generates a response turn with no tool calls or when the total response length reaches
@@ -832,6 +854,42 @@ class GRPOConfig(_BaseConfig):
"non-truncated completions are considered."
},
)
+ entropy_coef: float = field(
+ default=0.0,
+ metadata={
+ "help": "Coefficient of the entropy regularization term in the loss. A positive value adds an entropy "
+ "bonus that encourages exploration. When `use_adaptive_entropy=True`, this serves as the initial "
+ "coefficient and is updated each optimizer step. Has no effect when set to `0.0` (default)."
+ },
+ )
+ use_adaptive_entropy: bool = field(
+ default=False,
+ metadata={
+ "help": "Whether to use adaptive entropy control, introduced in Skywork-OR1 "
+ "(https://huggingface.co/papers/2505.22312). When enabled, `entropy_coef` is incremented by "
+ "`entropy_coef_delta` when entropy is below `entropy_target`, and decremented otherwise."
+ },
+ )
+ entropy_coef_min: float = field(
+ default=0.0,
+ metadata={"help": "Lower bound for the entropy coefficient when using adaptive entropy control."},
+ )
+ entropy_coef_max: float = field(
+ default=1.0,
+ metadata={"help": "Upper bound for the entropy coefficient when using adaptive entropy control."},
+ )
+ entropy_coef_delta: float = field(
+ default=0.005,
+ metadata={"help": "Step size for adjusting the entropy coefficient during adaptive entropy control."},
+ )
+ entropy_target: float = field(
+ default=0.2,
+ metadata={
+ "help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only "
+ "applied when current entropy is at or below this value. Typical language models have per-token "
+ "entropies of 2–10 nats; the default of 0.2 nearly always triggers regularization, so tune this."
+ },
+ )
max_tool_calling_iterations: int | None = field(
default=None,
metadata={
From dcaaf676b7733ff7c190413fc47382e0977ff18d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:21:20 +0200
Subject: [PATCH 02/37] Add init fields to GRPOTrainer
---
trl/trainer/grpo_trainer.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index a7a673e8784..5d6b71ffc8c 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -667,6 +667,10 @@ def __init__(
f"Unknown importance sampling level: {self.importance_sampling_level}. "
"Possible values are 'token' and 'sequence'."
)
+ self.entropy_coef = args.entropy_coef
+ self.use_adaptive_entropy = args.use_adaptive_entropy
+ if self.use_liger_kernel and self.entropy_coef != 0.0:
+ raise NotImplementedError("Entropy bonus is not supported with Liger kernel.")
# Datasets
self.shuffle_dataset = args.shuffle_dataset
From 0f6306e748ec62f56da01e1ce24645183cfc6a2e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:27:28 +0200
Subject: [PATCH 03/37] Update _compute_loss
---
trl/trainer/grpo_trainer.py | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 5d6b71ffc8c..24c436beaf1 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2740,6 +2740,33 @@ def _compute_loss(self, model, inputs):
else:
raise ValueError(f"Unknown loss type: {self.loss_type}")
+ # Entropy bonus: add entropy regularization to encourage exploration
+ if self.entropy_coef != 0.0:
+ if self.loss_type in ["grpo", "sapo", "luspo"]:
+ entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
+ elif self.loss_type == "bnpo":
+ entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer
+ elif self.loss_type == "dr_grpo":
+ entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+ elif self.loss_type in ["cispo", "dapo", "vespo"]:
+ entropy_loss = (entropies * mask).sum() / normalizer
+
+ world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item()
+ if self.use_adaptive_entropy:
+ if world_entropy < self.args.entropy_target:
+ self.entropy_coef = min(
+ self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+ )
+ else:
+ self.entropy_coef = max(
+ self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+ )
+ apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0
+ else:
+ apply_coef = self.entropy_coef
+
+ loss = loss - apply_coef * entropy_loss
+
# The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too
if self.aux_loss_enabled:
normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0
@@ -2755,6 +2782,11 @@ def masked_batch_mean(x):
else:
return (x * mask).sum() / completion_token_count
+ self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(loss.detach(), reduction="mean").item())
+ if self.entropy_coef != 0.0:
+ self._metrics[mode]["entropy_loss"].append(world_entropy)
+ self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
+
if self.beta != 0.0:
mean_kl = masked_batch_mean(per_token_kl)
self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item())
From 9b1cc655cd9a8802b4fce438b507ab4ea67b1ef6 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:27:55 +0200
Subject: [PATCH 04/37] Add checkpoint persistence
---
trl/trainer/grpo_trainer.py | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 24c436beaf1..95e04e79fb2 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -17,6 +17,7 @@
import copy
import importlib.resources as pkg_resources
import inspect
+import json
import math
import os
import sys
@@ -2921,3 +2922,16 @@ def _save_checkpoint(self, model, trial):
model_name = self.args.hub_model_id.split("/")[-1]
self.create_model_card(model_name=model_name)
super()._save_checkpoint(model, trial)
+ if self.use_adaptive_entropy and self.is_world_process_zero():
+ checkpoint_folder = f"checkpoint-{self.state.global_step}"
+ output_dir = os.path.join(self.args.output_dir, checkpoint_folder)
+ with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:
+ json.dump({"entropy_coef": self.entropy_coef}, f)
+
+ def _load_optimizer_and_scheduler(self, checkpoint):
+ super()._load_optimizer_and_scheduler(checkpoint)
+ if self.use_adaptive_entropy and checkpoint is not None:
+ path = os.path.join(checkpoint, "entropy_ctrl_state.json")
+ if os.path.exists(path):
+ with open(path) as f:
+ self.entropy_coef = json.load(f)["entropy_coef"]
From e9447139352eb96fb0db579c2f3e843b92db14a3 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:28:58 +0200
Subject: [PATCH 05/37] Update GRPO docs
---
docs/source/grpo_trainer.md | 43 +++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 0621d5ee689..940f5947026 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -185,7 +185,10 @@ While training and evaluating, we record the following reward metrics:
- `reward`: The overall average reward after summing rewards across functions (weighted by `reward_weights`).
- `reward_std`: The standard deviation of summed rewards across functions (weighted by `reward_weights`), computed over the full batch.
- `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
+- `policy_loss`: The policy gradient loss value (before any entropy bonus).
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
+- `entropy_loss`: The aggregated entropy used as the regularization term. Logged only if `entropy_coef` is nonzero.
+- `entropy_coef`: The current entropy coefficient. Logged only if `entropy_coef` is nonzero. Changes each optimizer step when `use_adaptive_entropy=True`.
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
- `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\).
@@ -641,6 +644,46 @@ and the reward will be computed as the sum of the rewards from each function, or
Note that [`GRPOTrainer`] supports multiple reward functions of different types. See the parameters documentation for more details.
+### Entropy regularization
+
+To encourage exploration and prevent the policy from collapsing to near-deterministic outputs, you can add an entropy bonus to the training objective. The entropy regularization augments the GRPO loss as follows:
+
+$$
+\mathcal{L}(\theta) = \mathcal{L}_{\text{GRPO}}(\theta) - \alpha \cdot \mathcal{H}(\pi_\theta),
+$$
+
+where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient.
+
+**Static entropy** — a fixed coefficient throughout training:
+
+```python
+from trl import GRPOConfig, GRPOTrainer
+
+training_args = GRPOConfig(entropy_coef=0.05, ...)
+```
+
+**Adaptive entropy** — the coefficient is updated each optimizer step based on a target entropy, as introduced in [Skywork-OR1](https://huggingface.co/papers/2505.22312). When the current entropy falls at or below `entropy_target`, the coefficient is incremented by `entropy_coef_delta`; otherwise it is decremented. The coefficient is only applied (i.e. non-zero) while entropy is at or below the target:
+
+```python
+training_args = GRPOConfig(
+ entropy_coef=0.01, # initial coefficient
+ use_adaptive_entropy=True,
+ entropy_target=5.0, # target mean per-token entropy (nats); tune for your model
+ entropy_coef_delta=0.005, # step size per optimizer step
+ entropy_coef_min=0.0,
+ entropy_coef_max=1.0,
+ ...
+)
+```
+
+
+
+Typical language models have per-token entropies of 2–10 nats. Because the coefficient is only applied while entropy is at or below the target, the default `entropy_target=0.2` almost never triggers regularization for such models; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric).
+
+
+
+When `use_adaptive_entropy=True`, the current entropy coefficient `entropy_coef` is saved alongside each checkpoint and restored on resume, so training is fully resumable.
+
### Rapid Experimentation for GRPO
RapidFire AI is an open-source experimentation engine that sits on top of TRL and lets you launch multiple GRPO configurations at once, even on a single GPU. Instead of trying configurations sequentially, RapidFire lets you **see all their learning curves earlier, stop underperforming runs, and clone promising ones with new settings in flight** without restarting. For more information, see [RapidFire AI Integration](rapidfire_integration).
From f47d5a58c29ef7781d542533c885160f2b7a1515 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:29:30 +0200
Subject: [PATCH 06/37] Add tests
---
tests/test_grpo_trainer.py | 68 ++++++++++++++++++++++++++++++++++++++
1 file changed, 68 insertions(+)
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index d7a0b9960da..2401b8323c5 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1474,6 +1474,74 @@ def test_train_with_cast_lm_head_to_fp32(self, model_name):
new_param = trainer.model.get_parameter(n)
assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+ def test_train_with_static_entropy(self):
+ dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+ training_args = GRPOConfig(
+ output_dir=self.tmp_dir,
+ learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates
+ per_device_train_batch_size=3, # reduce the batch size to reduce memory usage
+ num_generations=3, # reduce the number of generations to reduce memory usage
+ max_completion_length=8, # reduce the completion length to reduce memory usage
+ report_to="none",
+ entropy_coef=0.1,
+ )
+ trainer = GRPOTrainer(
+ model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+ reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+ args=training_args,
+ train_dataset=dataset,
+ )
+
+ previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+ trainer.train()
+
+ assert trainer.state.log_history[-1]["train_loss"] is not None
+ assert trainer.state.log_history[-1]["policy_loss"] is not None
+ assert trainer.state.log_history[-1]["entropy_loss"] is not None
+ assert trainer.state.log_history[-1]["entropy_coef"] is not None
+
+ # Check that the params have changed
+ for n, param in previous_trainable_params.items():
+ new_param = trainer.model.get_parameter(n)
+ assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
+ def test_train_with_adaptive_entropy(self):
+ dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+ training_args = GRPOConfig(
+ output_dir=self.tmp_dir,
+ learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates
+ per_device_train_batch_size=3, # reduce the batch size to reduce memory usage
+ num_generations=3, # reduce the number of generations to reduce memory usage
+ max_completion_length=8, # reduce the completion length to reduce memory usage
+ report_to="none",
+ entropy_coef=0.01,
+ use_adaptive_entropy=True,
+ entropy_target=15.0, # above any realistic entropy → coef is always incremented
+ )
+ trainer = GRPOTrainer(
+ model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+ reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+ args=training_args,
+ train_dataset=dataset,
+ )
+
+ previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+ trainer.train()
+
+ assert trainer.state.log_history[-1]["train_loss"] is not None
+ assert trainer.state.log_history[-1]["policy_loss"] is not None
+ assert trainer.state.log_history[-1]["entropy_loss"] is not None
+ assert trainer.state.log_history[-1]["entropy_coef"] is not None
+ # Coefficient should have increased since entropy < target throughout training
+ assert trainer.entropy_coef > 0.01
+
+ # Check that the params have changed
+ for n, param in previous_trainable_params.items():
+ new_param = trainer.model.get_parameter(n)
+ assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
def test_train_with_entropy_filter(self):
dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
training_args = GRPOConfig(
From 2484e70c30e2f7f5a443bed84ef5fd6b6a6c1ded Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 12:48:14 +0000
Subject: [PATCH 07/37] Address issues from review
---
docs/source/grpo_trainer.md | 4 ++--
trl/trainer/grpo_trainer.py | 32 ++++++++++++++++++++------------
2 files changed, 22 insertions(+), 14 deletions(-)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 940f5947026..c738b26a0c9 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -187,8 +187,8 @@ While training and evaluating, we record the following reward metrics:
- `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
- `policy_loss`: The policy gradient loss value (before any entropy bonus).
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
-- `entropy_loss`: The aggregated entropy used as the regularization term. Logged only if `entropy_coef` is nonzero.
-- `entropy_coef`: The current entropy coefficient. Logged only if `entropy_coef` is nonzero. Changes each optimizer step when `use_adaptive_entropy=True`.
+- `entropy_loss`: The aggregated entropy used as the regularization term. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
+- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
- `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\).
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 95e04e79fb2..1b8c4ad2e48 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2741,8 +2741,14 @@ def _compute_loss(self, model, inputs):
else:
raise ValueError(f"Unknown loss type: {self.loss_type}")
- # Entropy bonus: add entropy regularization to encourage exploration
- if self.entropy_coef != 0.0:
+ # Capture the pure policy loss for logging before entropy/aux modify it
+ policy_loss = loss.detach()
+
+ # Entropy bonus: add entropy regularization to encourage exploration.
+ # Gate: run whenever a non-zero static coef is set OR adaptive mode is enabled. Adaptive must always run even
+ # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy
+ # drops below entropy_target again.
+ if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
if self.loss_type in ["grpo", "sapo", "luspo"]:
entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
elif self.loss_type == "bnpo":
@@ -2754,14 +2760,16 @@ def _compute_loss(self, model, inputs):
world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item()
if self.use_adaptive_entropy:
- if world_entropy < self.args.entropy_target:
- self.entropy_coef = min(
- self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
- )
- else:
- self.entropy_coef = max(
- self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
- )
+ # Update the coefficient once per optimizer step, not per micro-batch
+ if self.accelerator.sync_gradients:
+ if world_entropy < self.args.entropy_target:
+ self.entropy_coef = min(
+ self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+ )
+ else:
+ self.entropy_coef = max(
+ self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+ )
apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0
else:
apply_coef = self.entropy_coef
@@ -2783,8 +2791,8 @@ def masked_batch_mean(x):
else:
return (x * mask).sum() / completion_token_count
- self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(loss.detach(), reduction="mean").item())
- if self.entropy_coef != 0.0:
+ self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(policy_loss, reduction="mean").item())
+ if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
self._metrics[mode]["entropy_loss"].append(world_entropy)
self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
From 45077474beff6be14afae94b44946a66f4326cbc Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:04:57 +0000
Subject: [PATCH 08/37] Fix wrong entropy for adaptive control
---
docs/source/grpo_trainer.md | 2 +-
trl/trainer/grpo_trainer.py | 6 +++++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index c738b26a0c9..7f19eb03cc1 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -187,7 +187,7 @@ While training and evaluating, we record the following reward metrics:
- `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
- `policy_loss`: The policy gradient loss value (before any entropy bonus).
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
-- `entropy_loss`: The aggregated entropy used as the regularization term. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
+- `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 1b8c4ad2e48..250a7650153 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2758,7 +2758,11 @@ def _compute_loss(self, model, inputs):
elif self.loss_type in ["cispo", "dapo", "vespo"]:
entropy_loss = (entropies * mask).sum() / normalizer
- world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item()
+ # Mean per-token entropy in nats across ranks — computed independently of the loss
+ # normalizer so its scale matches entropy_target (loss-scaled entropy_loss would not).
+ world_entropy = self.accelerator.reduce(
+ ((entropies * mask).sum() / mask.sum().clamp(min=1.0)).detach(), reduction="mean"
+ ).item()
if self.use_adaptive_entropy:
# Update the coefficient once per optimizer step, not per micro-batch
if self.accelerator.sync_gradients:
From 9b70a4a7986234deabd4fd5480d10340c68bd8f3 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:05:23 +0000
Subject: [PATCH 09/37] Fix Liger skips adaptive entropy guard
---
trl/trainer/grpo_trainer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 250a7650153..89d2b1e1044 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -670,7 +670,7 @@ def __init__(
)
self.entropy_coef = args.entropy_coef
self.use_adaptive_entropy = args.use_adaptive_entropy
- if self.use_liger_kernel and self.entropy_coef != 0.0:
+ if self.use_liger_kernel and (self.entropy_coef != 0.0 or self.use_adaptive_entropy):
raise NotImplementedError("Entropy bonus is not supported with Liger kernel.")
# Datasets
From 9d79e4a8e6896a5ad40b967e0ff8f62eb1c95e1c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:49:03 +0000
Subject: [PATCH 10/37] Fix inconsistent inequality
---
trl/trainer/grpo_trainer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 89d2b1e1044..b4f18fe59b7 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2766,7 +2766,7 @@ def _compute_loss(self, model, inputs):
if self.use_adaptive_entropy:
# Update the coefficient once per optimizer step, not per micro-batch
if self.accelerator.sync_gradients:
- if world_entropy < self.args.entropy_target:
+ if world_entropy <= self.args.entropy_target:
self.entropy_coef = min(
self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
)
From 46c8a64f6ba5ad156cbcbc24a1a2a7f2d4b89575 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:55:09 +0000
Subject: [PATCH 11/37] Fix mean reduction with sum-count-divide
---
trl/trainer/grpo_trainer.py | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index b4f18fe59b7..b93369fc27e 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2758,11 +2758,13 @@ def _compute_loss(self, model, inputs):
elif self.loss_type in ["cispo", "dapo", "vespo"]:
entropy_loss = (entropies * mask).sum() / normalizer
- # Mean per-token entropy in nats across ranks — computed independently of the loss
- # normalizer so its scale matches entropy_target (loss-scaled entropy_loss would not).
- world_entropy = self.accelerator.reduce(
- ((entropies * mask).sum() / mask.sum().clamp(min=1.0)).detach(), reduction="mean"
- ).item()
+ # True global mean per-token entropy (nats): reduce sum and token count jointly so
+ # that ranks with fewer tokens don't get equal weight (averaging per-rank means would
+ # be biased when completion lengths differ across ranks).
+ entropy_stats = self.accelerator.reduce(
+ torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+ )
+ world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
if self.use_adaptive_entropy:
# Update the coefficient once per optimizer step, not per micro-batch
if self.accelerator.sync_gradients:
From 3f7a6692080ae0f0b6e64804aa4ef8b8f64522ac Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:19:33 +0000
Subject: [PATCH 12/37] Set _last_world_entropy at init
---
trl/trainer/grpo_trainer.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index b93369fc27e..b1727d58d62 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -670,6 +670,9 @@ def __init__(
)
self.entropy_coef = args.entropy_coef
self.use_adaptive_entropy = args.use_adaptive_entropy
+ # Cached entropy from the last optimizer step; inf so the first accumulation window
+ # applies no bonus until a real measurement arrives (conservative default).
+ self._last_world_entropy = float("inf")
if self.use_liger_kernel and (self.entropy_coef != 0.0 or self.use_adaptive_entropy):
raise NotImplementedError("Entropy bonus is not supported with Liger kernel.")
From a05c97907367a3f0e7754d6b0189b57f1183b1be Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:21:15 +0000
Subject: [PATCH 13/37] Cache world_entropy at sync point and use that cached
value for apply_coef
---
trl/trainer/grpo_trainer.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index b1727d58d62..0d00f40257b 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2769,7 +2769,10 @@ def _compute_loss(self, model, inputs):
)
world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
if self.use_adaptive_entropy:
- # Update the coefficient once per optimizer step, not per micro-batch
+ # Update coefficient and cache entropy once per optimizer step, not per micro-batch.
+ # apply_coef uses the cached value so all micro-batches within one accumulation
+ # window apply the same bonus (using per-micro-batch world_entropy would cause
+ # the bonus to toggle on/off unpredictably across accumulation steps).
if self.accelerator.sync_gradients:
if world_entropy <= self.args.entropy_target:
self.entropy_coef = min(
@@ -2779,7 +2782,8 @@ def _compute_loss(self, model, inputs):
self.entropy_coef = max(
self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
)
- apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0
+ self._last_world_entropy = world_entropy
+ apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0
else:
apply_coef = self.entropy_coef
From fe03dd1037fb4c79e25ae2bbe9dee686db17ccad Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:30:58 +0000
Subject: [PATCH 14/37] Persist also _last_world_entropy
---
trl/trainer/grpo_trainer.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 0d00f40257b..7ace9470a58 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2947,7 +2947,7 @@ def _save_checkpoint(self, model, trial):
checkpoint_folder = f"checkpoint-{self.state.global_step}"
output_dir = os.path.join(self.args.output_dir, checkpoint_folder)
with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:
- json.dump({"entropy_coef": self.entropy_coef}, f)
+ json.dump({"entropy_coef": self.entropy_coef, "last_world_entropy": self._last_world_entropy}, f)
def _load_optimizer_and_scheduler(self, checkpoint):
super()._load_optimizer_and_scheduler(checkpoint)
@@ -2955,4 +2955,6 @@ def _load_optimizer_and_scheduler(self, checkpoint):
path = os.path.join(checkpoint, "entropy_ctrl_state.json")
if os.path.exists(path):
with open(path) as f:
- self.entropy_coef = json.load(f)["entropy_coef"]
+ state = json.load(f)
+ self.entropy_coef = state["entropy_coef"]
+ self._last_world_entropy = state.get("last_world_entropy", float("inf"))
From f099349fa7d7d2fc5a74868b6f56e1b70b33d9ed Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:31:24 +0000
Subject: [PATCH 15/37] Add paper_index entry
---
docs/source/paper_index.md | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md
index e8491893389..ea85a7b8594 100644
--- a/docs/source/paper_index.md
+++ b/docs/source/paper_index.md
@@ -225,6 +225,27 @@ training_args = GRPOConfig(
)
```
+### Skywork-OR1: Open Reasoning Models
+
+**📜 Paper**: https://huggingface.co/papers/2505.22312
+
+Skywork-OR1 is a family of open reasoning models trained with GRPO. The paper introduces **adaptive entropy control**: an entropy regularization term `−α·H(π_θ)` is added to the GRPO loss (so minimizing the loss rewards higher entropy), and the coefficient `α` is automatically adjusted each optimizer step. When the model's mean per-token entropy is at or below a target, `α` is incremented by `entropy_coef_delta` to encourage more exploration; otherwise it is decremented by the same amount. In both cases `α` is kept within `[entropy_coef_min, entropy_coef_max]`. The bonus is only applied while entropy is at or below the target. To replicate this adaptive entropy control, use the following configuration:
+
+```python
+from trl import GRPOConfig, GRPOTrainer
+
+training_args = GRPOConfig(
+ use_adaptive_entropy=True, # enable adaptive entropy control (Section 3.3 of the paper)
+ entropy_coef=0.01, # initial entropy regularization coefficient
+ entropy_target=5.0, # target mean per-token entropy (nats); tune for your model
+ entropy_coef_delta=0.005, # step size for coefficient updates per optimizer step
+)
+trainer = GRPOTrainer(
+ ...,
+ args=training_args,
+)
+```
+
### Beyond the 80/20 Rule: High-Entropy Minority Tokens Drive Effective Reinforcement Learning for LLM Reasoning
**📜 Paper**: https://huggingface.co/papers/2506.01939
From 5288cd5e2987a0952e17d9760346e66715dec537 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:18:06 +0000
Subject: [PATCH 16/37] Capture the pure policy loss before normalization
---
trl/trainer/grpo_trainer.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 7ace9470a58..9c0c34a46a4 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2724,29 +2724,31 @@ def _compute_loss(self, model, inputs):
if self.loss_type in ["grpo", "sapo"]:
loss = ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean()
normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval
+ policy_loss = loss.detach()
loss = loss / normalizer
elif self.loss_type == "bnpo":
loss = (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)
normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval
+ policy_loss = loss.detach()
loss = loss / normalizer
elif self.loss_type == "dr_grpo":
loss = (per_token_loss * mask).sum() / (per_token_loss.size(0) * self.max_completion_length)
normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval
+ policy_loss = loss.detach()
loss = loss / normalizer
elif self.loss_type in ["cispo", "dapo", "vespo"]:
normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
loss = (per_token_loss * mask).sum() / normalizer
+ policy_loss = loss.detach()
elif self.loss_type == "luspo":
# Unless importance_sampling_level="token" (not recommended here), per_token_loss is expected to be (B, 1)
loss = (per_token_loss * mask.sum(1, keepdim=True)).mean()
normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0
+ policy_loss = loss.detach()
loss = loss / normalizer
else:
raise ValueError(f"Unknown loss type: {self.loss_type}")
- # Capture the pure policy loss for logging before entropy/aux modify it
- policy_loss = loss.detach()
-
# Entropy bonus: add entropy regularization to encourage exploration.
# Gate: run whenever a non-zero static coef is set OR adaptive mode is enabled. Adaptive must always run even
# when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy
From 03f4208c02b3ae854ef8378360d20b340590d9d9 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:18:28 +0000
Subject: [PATCH 17/37] Fix luspo loss
---
trl/trainer/grpo_trainer.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 9c0c34a46a4..5c06e0bc368 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2754,7 +2754,7 @@ def _compute_loss(self, model, inputs):
# when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy
# drops below entropy_target again.
if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
- if self.loss_type in ["grpo", "sapo", "luspo"]:
+ if self.loss_type in ["grpo", "sapo"]:
entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
elif self.loss_type == "bnpo":
entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer
@@ -2762,6 +2762,9 @@ def _compute_loss(self, model, inputs):
entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
elif self.loss_type in ["cispo", "dapo", "vespo"]:
entropy_loss = (entropies * mask).sum() / normalizer
+ elif self.loss_type == "luspo":
+ # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
+ entropy_loss = (entropies * mask).sum(-1).mean() / normalizer
# True global mean per-token entropy (nats): reduce sum and token count jointly so
# that ranks with fewer tokens don't get equal weight (averaging per-rank means would
From dbc0c7592c6437d0c45b2cdc78043e90c7cdc75d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:19:43 +0000
Subject: [PATCH 18/37] Gate policy_loss logging and align style
---
trl/trainer/grpo_trainer.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 5c06e0bc368..db4926a71ec 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2794,6 +2794,10 @@ def _compute_loss(self, model, inputs):
loss = loss - apply_coef * entropy_loss
+ self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
+ self._metrics[mode]["entropy_loss"].append(world_entropy)
+ self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
+
# The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too
if self.aux_loss_enabled:
normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0
@@ -2809,11 +2813,6 @@ def masked_batch_mean(x):
else:
return (x * mask).sum() / completion_token_count
- self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(policy_loss, reduction="mean").item())
- if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
- self._metrics[mode]["entropy_loss"].append(world_entropy)
- self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
-
if self.beta != 0.0:
mean_kl = masked_batch_mean(per_token_kl)
self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item())
From 506fbf9f4cd8b838b90c6ed6982ae432c7baf70c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:01:00 +0000
Subject: [PATCH 19/37] Fix entropy state written to wrong path
---
trl/trainer/grpo_trainer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index aeef9676067..1576425522d 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2959,7 +2959,7 @@ def _save_checkpoint(self, model, trial):
super()._save_checkpoint(model, trial)
if self.use_adaptive_entropy and self.is_world_process_zero():
checkpoint_folder = f"checkpoint-{self.state.global_step}"
- output_dir = os.path.join(self.args.output_dir, checkpoint_folder)
+ output_dir = os.path.join(self._get_output_dir(trial=trial), checkpoint_folder)
with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:
json.dump({"entropy_coef": self.entropy_coef, "last_world_entropy": self._last_world_entropy}, f)
From 8a6b53dde71d9340a2491bbc83744fca9e35484d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:01:43 +0000
Subject: [PATCH 20/37] Fix is_world_process_zero() vs args.should_save guard
mismatch
---
trl/trainer/grpo_trainer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 1576425522d..e6f60c1831f 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2957,7 +2957,7 @@ def _save_checkpoint(self, model, trial):
model_name = self.args.hub_model_id.split("/")[-1]
self.create_model_card(model_name=model_name)
super()._save_checkpoint(model, trial)
- if self.use_adaptive_entropy and self.is_world_process_zero():
+ if self.use_adaptive_entropy and self.args.should_save:
checkpoint_folder = f"checkpoint-{self.state.global_step}"
output_dir = os.path.join(self._get_output_dir(trial=trial), checkpoint_folder)
with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:
From 474b30c479c795403a041439e781c66d9974e345 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:03:03 +0000
Subject: [PATCH 21/37] Update docs: policy_loss only logged inside entropy
block
---
docs/source/grpo_trainer.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 7f19eb03cc1..bdd366ee3d5 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -185,7 +185,7 @@ While training and evaluating, we record the following reward metrics:
- `reward`: The overall average reward after summing rewards across functions (weighted by `reward_weights`).
- `reward_std`: The standard deviation of summed rewards across functions (weighted by `reward_weights`), computed over the full batch.
- `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
-- `policy_loss`: The policy gradient loss value (before any entropy bonus).
+- `policy_loss`: The policy gradient loss value (before any entropy bonus). Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
- `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
From a0b9ec68aa11fe3399dc503796cc4296d3240b8e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:04:37 +0000
Subject: [PATCH 22/37] Log entropy_coef only when sync_gradients=True
---
trl/trainer/grpo_trainer.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index e6f60c1831f..03bbbdda207 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2806,7 +2806,10 @@ def _compute_loss(self, model, inputs):
self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
self._metrics[mode]["entropy_loss"].append(world_entropy)
- self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
+ # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients),
+ # so logging K identical values per step would dilute the metric with stale data.
+ if self.accelerator.sync_gradients:
+ self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
# The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too
if self.aux_loss_enabled:
From 608b1e0a8772697eebf4b442dbebddd1cdde80eb Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:06:30 +0000
Subject: [PATCH 23/37] Add guard for entropy-loss dispatch matching
policy-loss dispatch
---
trl/trainer/grpo_trainer.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 03bbbdda207..387eacd15fb 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2775,6 +2775,8 @@ def _compute_loss(self, model, inputs):
elif self.loss_type == "luspo":
# luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
entropy_loss = (entropies * mask).sum(-1).mean() / normalizer
+ else:
+ raise ValueError(f"Unknown loss type: {self.loss_type}")
# True global mean per-token entropy (nats): reduce sum and token count jointly so
# that ranks with fewer tokens don't get equal weight (averaging per-rank means would
From 81841ad99cb4e0f94ddc2fbffaf5b66286a674f8 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:18:38 +0000
Subject: [PATCH 24/37] Remove entropy_loss
---
docs/source/grpo_trainer.md | 3 +--
tests/test_grpo_trainer.py | 2 --
trl/trainer/grpo_trainer.py | 20 ++++++++------------
3 files changed, 9 insertions(+), 16 deletions(-)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index bdd366ee3d5..e6fb82f1f88 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -187,8 +187,7 @@ While training and evaluating, we record the following reward metrics:
- `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
- `policy_loss`: The policy gradient loss value (before any entropy bonus). Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
-- `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
-- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
+- `entropy_coef`: The current entropy regularization coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
- `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\).
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index 186a815d2eb..29f07aff5ff 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1498,7 +1498,6 @@ def test_train_with_static_entropy(self):
assert trainer.state.log_history[-1]["train_loss"] is not None
assert trainer.state.log_history[-1]["policy_loss"] is not None
- assert trainer.state.log_history[-1]["entropy_loss"] is not None
assert trainer.state.log_history[-1]["entropy_coef"] is not None
# Check that the params have changed
@@ -1532,7 +1531,6 @@ def test_train_with_adaptive_entropy(self):
assert trainer.state.log_history[-1]["train_loss"] is not None
assert trainer.state.log_history[-1]["policy_loss"] is not None
- assert trainer.state.log_history[-1]["entropy_loss"] is not None
assert trainer.state.log_history[-1]["entropy_coef"] is not None
# Coefficient should have increased since entropy < target throughout training
assert trainer.entropy_coef > 0.01
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 387eacd15fb..21fadcc16b6 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2778,18 +2778,15 @@ def _compute_loss(self, model, inputs):
else:
raise ValueError(f"Unknown loss type: {self.loss_type}")
- # True global mean per-token entropy (nats): reduce sum and token count jointly so
- # that ranks with fewer tokens don't get equal weight (averaging per-rank means would
- # be biased when completion lengths differ across ranks).
- entropy_stats = self.accelerator.reduce(
- torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
- )
- world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
if self.use_adaptive_entropy:
- # Update coefficient and cache entropy once per optimizer step, not per micro-batch.
- # apply_coef uses the cached value so all micro-batches within one accumulation
- # window apply the same bonus (using per-micro-batch world_entropy would cause
- # the bonus to toggle on/off unpredictably across accumulation steps).
+ # Reduce sum and token count jointly for a true global mean (unbiased when ranks
+ # have different completion lengths). Update coefficient and cache entropy once per
+ # optimizer step; apply_coef uses the cached value so all micro-batches within one
+ # accumulation window apply the same bonus.
+ entropy_stats = self.accelerator.reduce(
+ torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+ )
+ world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
if self.accelerator.sync_gradients:
if world_entropy <= self.args.entropy_target:
self.entropy_coef = min(
@@ -2807,7 +2804,6 @@ def _compute_loss(self, model, inputs):
loss = loss - apply_coef * entropy_loss
self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
- self._metrics[mode]["entropy_loss"].append(world_entropy)
# Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients),
# so logging K identical values per step would dilute the metric with stale data.
if self.accelerator.sync_gradients:
From bee5126842614b1d3fbdf0df01df498d95ca5c3b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:22:53 +0000
Subject: [PATCH 25/37] Gate on train mode to avoid entropy state update during
eval
---
trl/trainer/grpo_trainer.py | 44 +++++++++++++++++++------------------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 21fadcc16b6..bebb43a4481 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2779,24 +2779,26 @@ def _compute_loss(self, model, inputs):
raise ValueError(f"Unknown loss type: {self.loss_type}")
if self.use_adaptive_entropy:
- # Reduce sum and token count jointly for a true global mean (unbiased when ranks
- # have different completion lengths). Update coefficient and cache entropy once per
- # optimizer step; apply_coef uses the cached value so all micro-batches within one
- # accumulation window apply the same bonus.
- entropy_stats = self.accelerator.reduce(
- torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
- )
- world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
- if self.accelerator.sync_gradients:
- if world_entropy <= self.args.entropy_target:
- self.entropy_coef = min(
- self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
- )
- else:
- self.entropy_coef = max(
- self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
- )
- self._last_world_entropy = world_entropy
+ if mode == "train":
+ # Reduce sum and token count jointly for a true global mean (unbiased when ranks
+ # have different completion lengths). Update coefficient and cache entropy once per
+ # optimizer step; apply_coef uses the cached value so all micro-batches within one
+ # accumulation window apply the same bonus. Gated on train mode so evaluation
+ # cannot mutate the entropy controller state.
+ entropy_stats = self.accelerator.reduce(
+ torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+ )
+ world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
+ if self.accelerator.sync_gradients:
+ if world_entropy <= self.args.entropy_target:
+ self.entropy_coef = min(
+ self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+ )
+ else:
+ self.entropy_coef = max(
+ self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+ )
+ self._last_world_entropy = world_entropy
apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0
else:
apply_coef = self.entropy_coef
@@ -2804,9 +2806,9 @@ def _compute_loss(self, model, inputs):
loss = loss - apply_coef * entropy_loss
self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
- # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients),
- # so logging K identical values per step would dilute the metric with stale data.
- if self.accelerator.sync_gradients:
+ # Log entropy_coef only on train optimizer-step boundaries: it updates once per step
+ # (sync_gradients), and sync_gradients is always True in eval (no accumulation context).
+ if mode == "train" and self.accelerator.sync_gradients:
self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
# The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too
From 2f34d156c5cc069fa5a2b7d58b1ce7b6b0c079b8 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:57:02 +0000
Subject: [PATCH 26/37] Fix entropy bonus ignoring quantile mask
---
trl/trainer/grpo_trainer.py | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index bebb43a4481..9b1a5a1c79e 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2764,17 +2764,22 @@ def _compute_loss(self, model, inputs):
# when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy
# drops below entropy_target again.
if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
+ # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
+ # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
+ effective_mask = mask if entropy_mask is None else mask * entropy_mask
if self.loss_type in ["grpo", "sapo"]:
- entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
+ entropy_loss = ((entropies * effective_mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
elif self.loss_type == "bnpo":
- entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer
+ entropy_loss = (entropies * effective_mask).sum() / mask.sum().clamp(min=1.0) / normalizer
elif self.loss_type == "dr_grpo":
- entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+ entropy_loss = (
+ (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+ )
elif self.loss_type in ["cispo", "dapo", "vespo"]:
- entropy_loss = (entropies * mask).sum() / normalizer
+ entropy_loss = (entropies * effective_mask).sum() / normalizer
elif self.loss_type == "luspo":
# luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
- entropy_loss = (entropies * mask).sum(-1).mean() / normalizer
+ entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
else:
raise ValueError(f"Unknown loss type: {self.loss_type}")
From 806078dda8370ee5479fcb5d50730ed3c5c826a4 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:05:48 +0000
Subject: [PATCH 27/37] Use effective_mask for the world_entropy all-reduce too
---
trl/trainer/grpo_trainer.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 9b1a5a1c79e..8107d5827b8 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2791,7 +2791,8 @@ def _compute_loss(self, model, inputs):
# accumulation window apply the same bonus. Gated on train mode so evaluation
# cannot mutate the entropy controller state.
entropy_stats = self.accelerator.reduce(
- torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+ torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(),
+ reduction="sum",
)
world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
if self.accelerator.sync_gradients:
From 2845ef455f3132a1caad88dc0bb6bca0a799f147 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:06:11 +0000
Subject: [PATCH 28/37] Update docs
---
docs/source/grpo_trainer.md | 2 +-
trl/trainer/grpo_config.py | 14 +++++++++-----
2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index e6fb82f1f88..0cb882a6846 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -677,7 +677,7 @@ training_args = GRPOConfig(
-Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric).
+Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly.
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index 230c8f50c10..a83dd7c0349 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -314,9 +314,11 @@ class GRPOConfig(_BaseConfig):
Step size for adjusting the entropy coefficient at each optimizer step during adaptive entropy control.
entropy_target (`float`, *optional*, defaults to `0.2`):
Target mean per-token entropy (in nats) used by adaptive entropy control. The coefficient is only
- applied when the current entropy falls at or below this value. Typical language models have per-token
- entropies in the range 2–10 nats; the default of `0.2` nearly always triggers regularization, so users
- should tune this to a value appropriate for their model and task.
+ applied when the current entropy falls at or below this value. Measured over the same token set as
+ the policy loss: all completion tokens by default, or only the high-entropy subset when
+ `top_entropy_quantile < 1.0`. Typical language models have per-token entropies in the range 2–10
+ nats; the default of `0.2` nearly always triggers regularization, so users should tune this to a
+ value appropriate for their model and task (and token subset when using `top_entropy_quantile`).
max_tool_calling_iterations (`int`, *optional*):
Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation
stops when the model generates a response turn with no tool calls or when the total response length reaches
@@ -886,8 +888,10 @@ class GRPOConfig(_BaseConfig):
default=0.2,
metadata={
"help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only "
- "applied when current entropy is at or below this value. Typical language models have per-token "
- "entropies of 2–10 nats; the default of 0.2 nearly always triggers regularization, so tune this."
+ "applied when current entropy is at or below this value. Measured over the same token set as the "
+ "policy loss (all completion tokens, or the high-entropy subset when top_entropy_quantile < 1.0). "
+ "Typical language models have per-token entropies of 2–10 nats; the default of 0.2 nearly always "
+ "triggers regularization, so tune this."
},
)
max_tool_calling_iterations: int | None = field(
From 2ed11c0a7d9f43d65364dd1b20a05a66f0c4d87b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:25:11 +0000
Subject: [PATCH 29/37] Use unified formula with mean per-token entropy of
active tokens
---
trl/trainer/grpo_trainer.py | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 8107d5827b8..305fcbb375e 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2767,21 +2767,10 @@ def _compute_loss(self, model, inputs):
# When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
# tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
effective_mask = mask if entropy_mask is None else mask * entropy_mask
- if self.loss_type in ["grpo", "sapo"]:
- entropy_loss = ((entropies * effective_mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
- elif self.loss_type == "bnpo":
- entropy_loss = (entropies * effective_mask).sum() / mask.sum().clamp(min=1.0) / normalizer
- elif self.loss_type == "dr_grpo":
- entropy_loss = (
- (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
- )
- elif self.loss_type in ["cispo", "dapo", "vespo"]:
- entropy_loss = (entropies * effective_mask).sum() / normalizer
- elif self.loss_type == "luspo":
- # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
- entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
- else:
- raise ValueError(f"Unknown loss type: {self.loss_type}")
+ # Mean per-active-token entropy, scaled for gradient accumulation like the policy loss.
+ # Uniform across all loss types so entropy_target and entropy_coef have consistent units
+ # (per-token nats) and match the world_entropy computed in the adaptive block below.
+ entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
if self.use_adaptive_entropy:
if mode == "train":
From 76255d3016e79ceace966c6bf50e6e3ff60b448f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 14:52:43 +0200
Subject: [PATCH 30/37] Make three-branch entropy-loss split
---
trl/trainer/grpo_trainer.py | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 50e8337e77b..280ea1eb446 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2767,10 +2767,21 @@ def _compute_loss(self, model, inputs):
# When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
# tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
effective_mask = mask if entropy_mask is None else mask * entropy_mask
- # Mean per-active-token entropy, scaled for gradient accumulation like the policy loss.
- # Uniform across all loss types so entropy_target and entropy_coef have consistent units
- # (per-token nats) and match the world_entropy computed in the adaptive block below.
- entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
+ # The entropy bonus must be normalized exactly like each loss type's policy loss, so that
+ # entropy_coef stays on a consistent scale and gradient accumulation remains correct. The
+ # normalizer differs by loss type: it is the gradient accumulation step count for the grpo
+ # family, but a global token count for the cispo/dapo/vespo family.
+ if self.loss_type in ["cispo", "dapo", "vespo"]:
+ # normalizer is a global token count, so summing the entropies accumulates over the
+ # optimizer step to the global token-weighted mean entropy, matching world_entropy below.
+ entropy_loss = (entropies * effective_mask).sum() / normalizer
+ elif self.loss_type == "luspo":
+ # luspo weights each sequence by its token count, so entropy is summed per sequence (not
+ # per-token averaged) to stay on the same scale as the policy loss.
+ entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
+ else: # grpo, sapo, bnpo, dr_grpo: normalizer is the gradient accumulation step count
+ # Token-weighted mean entropy of active tokens, matching world_entropy below.
+ entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
if self.use_adaptive_entropy:
if mode == "train":
From fc76d4b497dab5310be5b222945162851041fb18 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 14:53:49 +0200
Subject: [PATCH 31/37] Compute bonus from frozen state, update per optimizer
step
---
trl/trainer/grpo_trainer.py | 51 ++++++++++++++++++++-----------------
1 file changed, 28 insertions(+), 23 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 280ea1eb446..886e9064a87 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2783,28 +2783,10 @@ def _compute_loss(self, model, inputs):
# Token-weighted mean entropy of active tokens, matching world_entropy below.
entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
+ # Apply the coefficient and gating from the end of the previous optimizer step, so that every
+ # micro-batch in the current accumulation window applies the same entropy bonus. The adaptive
+ # update below only takes effect on the next step.
if self.use_adaptive_entropy:
- if mode == "train":
- # Reduce sum and token count jointly for a true global mean (unbiased when ranks
- # have different completion lengths). Update coefficient and cache entropy once per
- # optimizer step; apply_coef uses the cached value so all micro-batches within one
- # accumulation window apply the same bonus. Gated on train mode so evaluation
- # cannot mutate the entropy controller state.
- entropy_stats = self.accelerator.reduce(
- torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(),
- reduction="sum",
- )
- world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
- if self.accelerator.sync_gradients:
- if world_entropy <= self.args.entropy_target:
- self.entropy_coef = min(
- self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
- )
- else:
- self.entropy_coef = max(
- self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
- )
- self._last_world_entropy = world_entropy
apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0
else:
apply_coef = self.entropy_coef
@@ -2812,8 +2794,31 @@ def _compute_loss(self, model, inputs):
loss = loss - apply_coef * entropy_loss
self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
- # Log entropy_coef only on train optimizer-step boundaries: it updates once per step
- # (sync_gradients), and sync_gradients is always True in eval (no accumulation context).
+
+ # Adaptive update: once per optimizer step, measure the token-weighted entropy of the current
+ # micro-batch (the one on which gradients sync), all-reduced across ranks, and adjust the coefficient
+ # for the next step. Gated on train mode so evaluation cannot mutate the entropy controller state, and
+ # on sync_gradients so the all-reduce runs once per optimizer step rather than on every micro-batch.
+ if self.use_adaptive_entropy and mode == "train" and self.accelerator.sync_gradients:
+ # Reduce sum and token count jointly for a true global mean (unbiased when ranks have
+ # different completion lengths).
+ entropy_stats = self.accelerator.reduce(
+ torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(),
+ reduction="sum",
+ )
+ world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
+ if world_entropy <= self.args.entropy_target:
+ self.entropy_coef = min(
+ self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+ )
+ else:
+ self.entropy_coef = max(
+ self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+ )
+ self._last_world_entropy = world_entropy
+
+ # Log entropy_coef on train optimizer-step boundaries (constant for static control; updated just
+ # above for adaptive control). sync_gradients is always True in eval (no accumulation context).
if mode == "train" and self.accelerator.sync_gradients:
self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
From bed5188839e63bfac5deefd74b7fb39eced5f770 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:00:44 +0200
Subject: [PATCH 32/37] Fix "nearly always triggers" docs
---
docs/source/grpo_trainer.md | 2 +-
trl/trainer/grpo_config.py | 11 +++++++----
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 0cb882a6846..c05cbd389b7 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -677,7 +677,7 @@ training_args = GRPOConfig(
-Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly.
+Typical language models have per-token entropies of 2–10 nats, so the default `entropy_target=0.2` almost never triggers regularization — the bonus only engages once entropy is at or below the target, i.e. near-complete collapse. Set it to a value meaningful for your model, e.g. close to the entropy you observe early in training (logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly.
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index a83dd7c0349..5c469aecfdc 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -317,8 +317,10 @@ class GRPOConfig(_BaseConfig):
applied when the current entropy falls at or below this value. Measured over the same token set as
the policy loss: all completion tokens by default, or only the high-entropy subset when
`top_entropy_quantile < 1.0`. Typical language models have per-token entropies in the range 2–10
- nats; the default of `0.2` nearly always triggers regularization, so users should tune this to a
- value appropriate for their model and task (and token subset when using `top_entropy_quantile`).
+ nats, so the default of `0.2` almost never triggers regularization (only on near-complete entropy
+ collapse); set it close to the entropy you observe early in training (logged as the `entropy`
+ metric) so the bonus engages before the policy collapses (and account for the token subset when
+ using `top_entropy_quantile`).
max_tool_calling_iterations (`int`, *optional*):
Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation
stops when the model generates a response turn with no tool calls or when the total response length reaches
@@ -890,8 +892,9 @@ class GRPOConfig(_BaseConfig):
"help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only "
"applied when current entropy is at or below this value. Measured over the same token set as the "
"policy loss (all completion tokens, or the high-entropy subset when top_entropy_quantile < 1.0). "
- "Typical language models have per-token entropies of 2–10 nats; the default of 0.2 nearly always "
- "triggers regularization, so tune this."
+ "Typical language models have per-token entropies of 2–10 nats, so the default of 0.2 almost never "
+ "triggers regularization (only on near-complete collapse); set it close to the entropy observed "
+ "early in training and tune from there."
},
)
max_tool_calling_iterations: int | None = field(
From 6e8f498ed10927d40131bfd5e6d4be587bdc23bd Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:02:28 +0200
Subject: [PATCH 33/37] Add scale test and grad-accumulation adaptive test
---
tests/test_grpo_trainer.py | 87 ++++++++++++++++++++++++++++++++++++++
1 file changed, 87 insertions(+)
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index 29f07aff5ff..8c816bcb33a 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1540,6 +1540,93 @@ def test_train_with_adaptive_entropy(self):
new_param = trainer.model.get_parameter(n)
assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+ @pytest.mark.parametrize("loss_type", ["grpo", "dapo", "luspo"])
+ def test_entropy_bonus_scale(self, loss_type):
+ # Regression test: the entropy bonus must be normalized like each loss type's policy loss. A previous
+ # "unified" formula divided the per-token mean entropy by the loss normalizer, which for the
+ # cispo/dapo/vespo family is a global token count, making the bonus ~1/sequence_length too small; and
+ # it put luspo's bonus on the per-token scale instead of luspo's sequence-weighted scale. With
+ # gradient_accumulation_steps=1 the per-step entropy contribution to the loss is
+ # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy reveals the scale.
+ entropy_coef = 0.5
+ dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+ training_args = GRPOConfig(
+ output_dir=self.tmp_dir,
+ importance_sampling_level="sequence" if loss_type == "luspo" else "token",
+ learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates
+ per_device_train_batch_size=3, # reduce the batch size to reduce memory usage
+ num_generations=3, # reduce the number of generations to reduce memory usage
+ max_completion_length=16, # long enough that the per-token vs sequence-weighted scales differ
+ gradient_accumulation_steps=1, # so contrib == entropy_coef * entropy_loss holds per step
+ loss_type=loss_type,
+ logging_steps=1,
+ report_to="none",
+ entropy_coef=entropy_coef,
+ )
+ trainer = GRPOTrainer(
+ model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+ reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+ args=training_args,
+ train_dataset=dataset,
+ )
+
+ trainer.train()
+
+ logs = [h for h in trainer.state.log_history if "policy_loss" in h and "loss" in h and h.get("entropy")]
+ assert logs
+ ratios = sorted((h["policy_loss"] - h["loss"]) / h["entropy"] for h in logs)
+ ratio = ratios[len(ratios) // 2] # median, robust to per-step noise
+ if loss_type == "luspo":
+ # luspo weights each sequence by its length, so the bonus is the per-sequence entropy sum: its
+ # scale is entropy_coef * (mean sequence length), well above entropy_coef. The buggy per-token
+ # formula gave ratio == entropy_coef.
+ assert ratio > 1.5 * entropy_coef
+ else:
+ # grpo (and the cispo/dapo/vespo family) regularize the per-token mean entropy, so the bonus is
+ # exactly entropy_coef * entropy. The buggy formula made dapo's ratio smaller by ~1/seq_len.
+ assert ratio == pytest.approx(entropy_coef, rel=0.3)
+
+ def test_train_with_adaptive_entropy_gradient_accumulation(self):
+ # Adaptive entropy must behave correctly under gradient accumulation: the coefficient and gating are
+ # frozen across an accumulation window and the controller updates once per optimizer step (not once
+ # per micro-batch). With entropy_target above any realistic entropy the coefficient is incremented by
+ # entropy_coef_delta on every optimizer step, so the final value pins down the number of updates.
+ dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+ training_args = GRPOConfig(
+ output_dir=self.tmp_dir,
+ learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates
+ per_device_train_batch_size=3, # reduce the batch size to reduce memory usage
+ num_generations=3, # reduce the number of generations to reduce memory usage
+ max_completion_length=8, # reduce the completion length to reduce memory usage
+ gradient_accumulation_steps=2, # exercise the accumulation window
+ report_to="none",
+ entropy_coef=0.01,
+ use_adaptive_entropy=True,
+ entropy_target=15.0, # above any realistic entropy → coef incremented once per optimizer step
+ entropy_coef_delta=0.005,
+ )
+ trainer = GRPOTrainer(
+ model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+ reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+ args=training_args,
+ train_dataset=dataset,
+ )
+
+ previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+ trainer.train()
+
+ assert trainer.state.log_history[-1]["train_loss"] is not None
+ # Exactly one increment per optimizer step (global_step counts optimizer steps, not micro-batches);
+ # a per-micro-batch update would overshoot this.
+ expected_coef = min(0.01 + 0.005 * trainer.state.global_step, 1.0)
+ assert trainer.entropy_coef == pytest.approx(expected_coef, abs=1e-6)
+
+ # Check that the params have changed
+ for n, param in previous_trainable_params.items():
+ new_param = trainer.model.get_parameter(n)
+ assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
def test_train_with_entropy_filter(self):
dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
training_args = GRPOConfig(
From 607d911d95cfcdd631f8164fb307b4ca433a7a6c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:47:33 +0200
Subject: [PATCH 34/37] Fix dr_grpo entropy scale mismatch
---
trl/trainer/grpo_trainer.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 886e9064a87..c2e4b211c8c 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2775,11 +2775,18 @@ def _compute_loss(self, model, inputs):
# normalizer is a global token count, so summing the entropies accumulates over the
# optimizer step to the global token-weighted mean entropy, matching world_entropy below.
entropy_loss = (entropies * effective_mask).sum() / normalizer
+ elif self.loss_type == "dr_grpo":
+ # Dr. GRPO normalizes by the fixed budget (batch size × max completion length) instead of the
+ # actual token count, to remove length bias; scale the entropy bonus the same way so that
+ # entropy_coef stays consistent with the policy term when completions are shorter than the max.
+ entropy_loss = (
+ (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+ )
elif self.loss_type == "luspo":
# luspo weights each sequence by its token count, so entropy is summed per sequence (not
# per-token averaged) to stay on the same scale as the policy loss.
entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
- else: # grpo, sapo, bnpo, dr_grpo: normalizer is the gradient accumulation step count
+ else: # grpo, sapo, bnpo: normalizer is the gradient accumulation step count
# Token-weighted mean entropy of active tokens, matching world_entropy below.
entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
From 0cfad37a15b4eeb45a888f7b428b74999c4c934d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 16:07:55 +0200
Subject: [PATCH 35/37] Accumulate to mean per-token entropy, independent of
how each loss type normalizes
---
trl/trainer/grpo_trainer.py | 29 ++++++++++-------------------
1 file changed, 10 insertions(+), 19 deletions(-)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index c2e4b211c8c..ead9707d764 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2767,27 +2767,18 @@ def _compute_loss(self, model, inputs):
# When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
# tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
effective_mask = mask if entropy_mask is None else mask * entropy_mask
- # The entropy bonus must be normalized exactly like each loss type's policy loss, so that
- # entropy_coef stays on a consistent scale and gradient accumulation remains correct. The
- # normalizer differs by loss type: it is the gradient accumulation step count for the grpo
- # family, but a global token count for the cispo/dapo/vespo family.
+ # Entropy bonus = mean per-token entropy H (the documented objective L = L_policy - coef * H), so
+ # H does not depend on how each loss type normalizes its policy term. The term is computed so that
+ # it accumulates to H over the optimizer step for every loss type and matches world_entropy below.
+ # The only wrinkle is the normalizer: most loss types divide by the gradient accumulation step
+ # count, but cispo/dapo/vespo divide by a global token count.
if self.loss_type in ["cispo", "dapo", "vespo"]:
- # normalizer is a global token count, so summing the entropies accumulates over the
- # optimizer step to the global token-weighted mean entropy, matching world_entropy below.
+ # normalizer is a global token count, so summing the entropies (instead of averaging them
+ # again) makes the term accumulate over the optimizer step to the global mean per-token
+ # entropy, like the other loss types.
entropy_loss = (entropies * effective_mask).sum() / normalizer
- elif self.loss_type == "dr_grpo":
- # Dr. GRPO normalizes by the fixed budget (batch size × max completion length) instead of the
- # actual token count, to remove length bias; scale the entropy bonus the same way so that
- # entropy_coef stays consistent with the policy term when completions are shorter than the max.
- entropy_loss = (
- (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
- )
- elif self.loss_type == "luspo":
- # luspo weights each sequence by its token count, so entropy is summed per sequence (not
- # per-token averaged) to stay on the same scale as the policy loss.
- entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
- else: # grpo, sapo, bnpo: normalizer is the gradient accumulation step count
- # Token-weighted mean entropy of active tokens, matching world_entropy below.
+ else:
+ # Mean per-token entropy of active tokens, scaled for gradient accumulation.
entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
# Apply the coefficient and gating from the end of the previous optimizer step, so that every
From 8e05132281547ac1222936c68479528176fcaa3f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 16:08:14 +0200
Subject: [PATCH 36/37] Update tests
---
tests/test_grpo_trainer.py | 29 ++++++++++++-----------------
1 file changed, 12 insertions(+), 17 deletions(-)
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index 8c816bcb33a..875ee9b3b78 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1540,14 +1540,16 @@ def test_train_with_adaptive_entropy(self):
new_param = trainer.model.get_parameter(n)
assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
- @pytest.mark.parametrize("loss_type", ["grpo", "dapo", "luspo"])
+ @pytest.mark.parametrize("loss_type", ["grpo", "dr_grpo", "dapo", "luspo"])
def test_entropy_bonus_scale(self, loss_type):
- # Regression test: the entropy bonus must be normalized like each loss type's policy loss. A previous
- # "unified" formula divided the per-token mean entropy by the loss normalizer, which for the
- # cispo/dapo/vespo family is a global token count, making the bonus ~1/sequence_length too small; and
- # it put luspo's bonus on the per-token scale instead of luspo's sequence-weighted scale. With
- # gradient_accumulation_steps=1 the per-step entropy contribution to the loss is
- # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy reveals the scale.
+ # Regression test: the entropy bonus is the mean per-token entropy H for every loss type (documented
+ # objective L = L_policy - entropy_coef * H), so it must not inherit any loss-type-specific policy
+ # normalization. A previous "unified" formula divided H by a global token count for the
+ # cispo/dapo/vespo family, shrinking the bonus by ~1/(global token count); conversely, scaling the
+ # bonus like the dr_grpo (fixed budget) or luspo (sequence-weighted) policy term would also be wrong.
+ # With gradient_accumulation_steps=1 the per-step entropy contribution to the loss is
+ # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy must equal
+ # entropy_coef for all loss types.
entropy_coef = 0.5
dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
training_args = GRPOConfig(
@@ -1556,7 +1558,7 @@ def test_entropy_bonus_scale(self, loss_type):
learning_rate=0.1, # use higher lr because gradients are tiny and default lr can stall updates
per_device_train_batch_size=3, # reduce the batch size to reduce memory usage
num_generations=3, # reduce the number of generations to reduce memory usage
- max_completion_length=16, # long enough that the per-token vs sequence-weighted scales differ
+ max_completion_length=16, # reduce the completion length to reduce memory usage
gradient_accumulation_steps=1, # so contrib == entropy_coef * entropy_loss holds per step
loss_type=loss_type,
logging_steps=1,
@@ -1576,15 +1578,8 @@ def test_entropy_bonus_scale(self, loss_type):
assert logs
ratios = sorted((h["policy_loss"] - h["loss"]) / h["entropy"] for h in logs)
ratio = ratios[len(ratios) // 2] # median, robust to per-step noise
- if loss_type == "luspo":
- # luspo weights each sequence by its length, so the bonus is the per-sequence entropy sum: its
- # scale is entropy_coef * (mean sequence length), well above entropy_coef. The buggy per-token
- # formula gave ratio == entropy_coef.
- assert ratio > 1.5 * entropy_coef
- else:
- # grpo (and the cispo/dapo/vespo family) regularize the per-token mean entropy, so the bonus is
- # exactly entropy_coef * entropy. The buggy formula made dapo's ratio smaller by ~1/seq_len.
- assert ratio == pytest.approx(entropy_coef, rel=0.3)
+ # Every loss type regularizes the mean per-token entropy, so contrib == entropy_coef * entropy.
+ assert ratio == pytest.approx(entropy_coef, rel=0.3)
def test_train_with_adaptive_entropy_gradient_accumulation(self):
# Adaptive entropy must behave correctly under gradient accumulation: the coefficient and gating are
From bccd8ebbf274edf12ef295c15e0f0e1ae578b306 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 26 Jun 2026 07:23:05 +0200
Subject: [PATCH 37/37] Add clarifying sentence
---
docs/source/grpo_trainer.md | 2 +-
trl/trainer/grpo_config.py | 8 ++++++--
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index c05cbd389b7..b1e0050e1a6 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -651,7 +651,7 @@ $$
\mathcal{L}(\theta) = \mathcal{L}_{\text{GRPO}}(\theta) - \alpha \cdot \mathcal{H}(\pi_\theta),
$$
-where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient.
+where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient. The bonus is always the mean per-token entropy regardless of `loss_type`; it is not rescaled to match a loss type's policy normalization (e.g. Dr. GRPO's `batch_size * max_completion_length` denominator), so `entropy_coef` has the same meaning for every loss type.
**Static entropy** — a fixed coefficient throughout training:
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index 5c469aecfdc..46fce0e13a9 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -297,7 +297,9 @@ class GRPOConfig(_BaseConfig):
`mask_truncated_completions=True`, only tokens from non-truncated completions are considered.
entropy_coef (`float`, *optional*, defaults to `0.0`):
Coefficient of the entropy regularization term in the loss. A positive value adds an entropy bonus that
- encourages exploration by keeping the policy from collapsing to near-deterministic outputs. When
+ encourages exploration by keeping the policy from collapsing to near-deterministic outputs. The bonus is
+ always the mean per-token entropy regardless of `loss_type`; it is not rescaled to match a loss type's
+ policy normalization, so `entropy_coef` has the same meaning for every loss type. When
`use_adaptive_entropy=True`, this serves as the initial coefficient and is updated each optimizer step.
Has no effect when set to `0.0` (default).
use_adaptive_entropy (`bool`, *optional*, defaults to `False`):
@@ -862,7 +864,9 @@ class GRPOConfig(_BaseConfig):
default=0.0,
metadata={
"help": "Coefficient of the entropy regularization term in the loss. A positive value adds an entropy "
- "bonus that encourages exploration. When `use_adaptive_entropy=True`, this serves as the initial "
+ "bonus that encourages exploration. The bonus is always the mean per-token entropy regardless of "
+ "`loss_type` (not rescaled to a loss type's policy normalization), so `entropy_coef` has the same "
+ "meaning for every loss type. When `use_adaptive_entropy=True`, this serves as the initial "
"coefficient and is updated each optimizer step. Has no effect when set to `0.0` (default)."
},
)