From ac50a1127573bee511abced1d0d0d0974e21dcfa Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:20:18 +0200
Subject: [PATCH 01/37] Add fields to GRPOConfig

---
 trl/trainer/grpo_config.py | 58 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index 5736b6c0ddc..230c8f50c10 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -295,6 +295,28 @@ class GRPOConfig(_BaseConfig):
             position, improving results. Range: `[0.0-1.0]`. A value of `0.0` masks all but the highest entropy token;
             `1.0` keeps all tokens. The paper recommends a value of `0.2`. If used with
             `mask_truncated_completions=True`, only tokens from non-truncated completions are considered.
+        entropy_coef (`float`, *optional*, defaults to `0.0`):
+            Coefficient of the entropy regularization term in the loss. A positive value adds an entropy bonus that
+            encourages exploration by keeping the policy from collapsing to near-deterministic outputs. When
+            `use_adaptive_entropy=True`, this serves as the initial coefficient and is updated each optimizer step.
+            Has no effect when set to `0.0` (default).
+        use_adaptive_entropy (`bool`, *optional*, defaults to `False`):
+            Whether to use adaptive entropy control, introduced in
+            [Skywork-OR1](https://huggingface.co/papers/2505.22312). When enabled, the entropy coefficient
+            `entropy_coef` is updated each optimizer step: incremented by `entropy_coef_delta` when the current
+            entropy is at or below `entropy_target`, and decremented otherwise. The coefficient is only applied when
+            entropy is at or below `entropy_target`.
+        entropy_coef_min (`float`, *optional*, defaults to `0.0`):
+            Lower bound for the entropy coefficient when using adaptive entropy control.
+        entropy_coef_max (`float`, *optional*, defaults to `1.0`):
+            Upper bound for the entropy coefficient when using adaptive entropy control.
+        entropy_coef_delta (`float`, *optional*, defaults to `0.005`):
+            Step size for adjusting the entropy coefficient at each optimizer step during adaptive entropy control.
+        entropy_target (`float`, *optional*, defaults to `0.2`):
+            Target mean per-token entropy (in nats) used by adaptive entropy control. The coefficient is only
+            applied when the current entropy falls at or below this value. Typical language models have per-token
+            entropies in the range 2–10 nats, so the default of `0.2` rarely triggers regularization; users
+            should tune this to a value appropriate for their model and task.
         max_tool_calling_iterations (`int`, *optional*):
             Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation
             stops when the model generates a response turn with no tool calls or when the total response length reaches
@@ -832,6 +854,42 @@ class GRPOConfig(_BaseConfig):
             "non-truncated completions are considered."
         },
     )
+    entropy_coef: float = field(
+        default=0.0,
+        metadata={
+            "help": "Coefficient of the entropy regularization term in the loss. A positive value adds an entropy "
+            "bonus that encourages exploration. When `use_adaptive_entropy=True`, this serves as the initial "
+            "coefficient and is updated each optimizer step. Has no effect when set to `0.0` (default)."
+        },
+    )
+    use_adaptive_entropy: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use adaptive entropy control, introduced in Skywork-OR1 "
+            "(https://huggingface.co/papers/2505.22312). When enabled, `entropy_coef` is incremented by "
+            "`entropy_coef_delta` when entropy is at or below `entropy_target`, and decremented otherwise."
+        },
+    )
+    entropy_coef_min: float = field(
+        default=0.0,
+        metadata={"help": "Lower bound for the entropy coefficient when using adaptive entropy control."},
+    )
+    entropy_coef_max: float = field(
+        default=1.0,
+        metadata={"help": "Upper bound for the entropy coefficient when using adaptive entropy control."},
+    )
+    entropy_coef_delta: float = field(
+        default=0.005,
+        metadata={"help": "Step size for adjusting the entropy coefficient during adaptive entropy control."},
+    )
+    entropy_target: float = field(
+        default=0.2,
+        metadata={
+            "help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only "
+            "applied when current entropy is at or below this value. Typical language models have per-token "
+            "entropies of 2–10 nats, so the default of 0.2 rarely triggers regularization; tune this."
+        },
+    )
     max_tool_calling_iterations: int | None = field(
         default=None,
         metadata={

From dcaaf676b7733ff7c190413fc47382e0977ff18d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:21:20 +0200
Subject: [PATCH 02/37] Add init fields to GRPOTrainer

---
 trl/trainer/grpo_trainer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index a7a673e8784..5d6b71ffc8c 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -667,6 +667,10 @@ def __init__(
                 f"Unknown importance sampling level: {self.importance_sampling_level}. "
                 "Possible values are 'token' and 'sequence'."
             )
+        self.entropy_coef = args.entropy_coef
+        self.use_adaptive_entropy = args.use_adaptive_entropy
+        if self.use_liger_kernel and self.entropy_coef != 0.0:
+            raise NotImplementedError("Entropy bonus is not supported with Liger kernel.")
 
         # Datasets
         self.shuffle_dataset = args.shuffle_dataset

From 0f6306e748ec62f56da01e1ce24645183cfc6a2e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:27:28 +0200
Subject: [PATCH 03/37] Update _compute_loss

---
 trl/trainer/grpo_trainer.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 5d6b71ffc8c..24c436beaf1 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2740,6 +2740,33 @@ def _compute_loss(self, model, inputs):
         else:
             raise ValueError(f"Unknown loss type: {self.loss_type}")
 
+        # Entropy bonus: add entropy regularization to encourage exploration
+        if self.entropy_coef != 0.0:
+            if self.loss_type in ["grpo", "sapo", "luspo"]:
+                entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
+            elif self.loss_type == "bnpo":
+                entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer
+            elif self.loss_type == "dr_grpo":
+                entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+            elif self.loss_type in ["cispo", "dapo", "vespo"]:
+                entropy_loss = (entropies * mask).sum() / normalizer
+
+            world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item()
+            if self.use_adaptive_entropy:
+                if world_entropy < self.args.entropy_target:
+                    self.entropy_coef = min(
+                        self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+                    )
+                else:
+                    self.entropy_coef = max(
+                        self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+                    )
+                apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0
+            else:
+                apply_coef = self.entropy_coef
+
+            loss = loss - apply_coef * entropy_loss
+
         # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too
         if self.aux_loss_enabled:
             normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0
@@ -2755,6 +2782,11 @@ def masked_batch_mean(x):
             else:
                 return (x * mask).sum() / completion_token_count
 
+        self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(loss.detach(), reduction="mean").item())
+        if self.entropy_coef != 0.0:
+            self._metrics[mode]["entropy_loss"].append(world_entropy)
+            self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
+
         if self.beta != 0.0:
             mean_kl = masked_batch_mean(per_token_kl)
             self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item())

From 9b1cc655cd9a8802b4fce438b507ab4ea67b1ef6 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:27:55 +0200
Subject: [PATCH 04/37] Add checkpoint persistence

---
 trl/trainer/grpo_trainer.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 24c436beaf1..95e04e79fb2 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -17,6 +17,7 @@
 import copy
 import importlib.resources as pkg_resources
 import inspect
+import json
 import math
 import os
 import sys
@@ -2921,3 +2922,16 @@ def _save_checkpoint(self, model, trial):
             model_name = self.args.hub_model_id.split("/")[-1]
         self.create_model_card(model_name=model_name)
         super()._save_checkpoint(model, trial)
+        if self.use_adaptive_entropy and self.is_world_process_zero():
+            checkpoint_folder = f"checkpoint-{self.state.global_step}"
+            output_dir = os.path.join(self.args.output_dir, checkpoint_folder)
+            with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:
+                json.dump({"entropy_coef": self.entropy_coef}, f)
+
+    def _load_optimizer_and_scheduler(self, checkpoint):
+        super()._load_optimizer_and_scheduler(checkpoint)
+        if self.use_adaptive_entropy and checkpoint is not None:
+            path = os.path.join(checkpoint, "entropy_ctrl_state.json")
+            if os.path.exists(path):
+                with open(path) as f:
+                    self.entropy_coef = json.load(f)["entropy_coef"]

From e9447139352eb96fb0db579c2f3e843b92db14a3 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:28:58 +0200
Subject: [PATCH 05/37] Update GRPO docs

---
 docs/source/grpo_trainer.md | 43 +++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 0621d5ee689..940f5947026 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -185,7 +185,10 @@ While training and evaluating, we record the following reward metrics:
 - `reward`: The overall average reward after summing rewards across functions (weighted by `reward_weights`).
 - `reward_std`: The standard deviation of summed rewards across functions (weighted by `reward_weights`), computed over the full batch.
 - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
+- `policy_loss`: The policy gradient loss value (before any entropy bonus).
 - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
+- `entropy_loss`: The aggregated entropy used as the regularization term. Logged only if `entropy_coef` is nonzero.
+- `entropy_coef`: The current entropy coefficient. Logged only if `entropy_coef` is nonzero. Changes each optimizer step when `use_adaptive_entropy=True`.
 - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
 - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:  \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
 - `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region:  \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\).
@@ -641,6 +644,46 @@ and the reward will be computed as the sum of the rewards from each function, or
 
 Note that [`GRPOTrainer`] supports multiple reward functions of different types. See the parameters documentation for more details.
 
+### Entropy regularization
+
+To encourage exploration and prevent the policy from collapsing to near-deterministic outputs, you can add an entropy bonus to the training objective. The entropy regularization augments the GRPO loss as follows:
+
+$$
+\mathcal{L}(\theta) = \mathcal{L}_{\text{GRPO}}(\theta) - \alpha \cdot \mathcal{H}(\pi_\theta),
+$$
+
+where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient.
+
+**Static entropy** — a fixed coefficient throughout training:
+
+```python
+from trl import GRPOConfig, GRPOTrainer
+
+training_args = GRPOConfig(entropy_coef=0.05, ...)
+```
+
+**Adaptive entropy** — the coefficient is updated each optimizer step based on a target entropy, as introduced in [Skywork-OR1](https://huggingface.co/papers/2505.22312). When the current entropy falls at or below `entropy_target`, the coefficient is incremented by `entropy_coef_delta`; otherwise it is decremented. The coefficient is only applied (i.e. non-zero) while entropy is at or below the target:
+
+```python
+training_args = GRPOConfig(
+    entropy_coef=0.01,          # initial coefficient
+    use_adaptive_entropy=True,
+    entropy_target=5.0,         # target mean per-token entropy (nats); tune for your model
+    entropy_coef_delta=0.005,   # step size per optimizer step
+    entropy_coef_min=0.0,
+    entropy_coef_max=1.0,
+    ...
+)
+```
+
+<Tip>
+
+Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` is well below that range, so regularization is rarely triggered; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric).
+
+</Tip>
+
+When `use_adaptive_entropy=True`, the current entropy coefficient `entropy_coef` is saved alongside each checkpoint and restored on resume, so training is fully resumable.
+
 ### Rapid Experimentation for GRPO
 
 RapidFire AI is an open-source experimentation engine that sits on top of TRL and lets you launch multiple GRPO configurations at once, even on a single GPU. Instead of trying configurations sequentially, RapidFire lets you **see all their learning curves earlier, stop underperforming runs, and clone promising ones with new settings in flight** without restarting. For more information, see [RapidFire AI Integration](rapidfire_integration).

From f47d5a58c29ef7781d542533c885160f2b7a1515 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:29:30 +0200
Subject: [PATCH 06/37] Add tests

---
 tests/test_grpo_trainer.py | 68 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index d7a0b9960da..2401b8323c5 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1474,6 +1474,74 @@ def test_train_with_cast_lm_head_to_fp32(self, model_name):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
+    def test_train_with_static_entropy(self):
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+        training_args = GRPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,  # use higher lr because gradients are tiny and default lr can stall updates
+            per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+            num_generations=3,  # reduce the number of generations to reduce memory usage
+            max_completion_length=8,  # reduce the completion length to reduce memory usage
+            report_to="none",
+            entropy_coef=0.1,
+        )
+        trainer = GRPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=training_args,
+            train_dataset=dataset,
+        )
+
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        trainer.train()
+
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+        assert trainer.state.log_history[-1]["policy_loss"] is not None
+        assert trainer.state.log_history[-1]["entropy_loss"] is not None
+        assert trainer.state.log_history[-1]["entropy_coef"] is not None
+
+        # Check that the params have changed
+        for n, param in previous_trainable_params.items():
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
+    def test_train_with_adaptive_entropy(self):
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+        training_args = GRPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,  # use higher lr because gradients are tiny and default lr can stall updates
+            per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+            num_generations=3,  # reduce the number of generations to reduce memory usage
+            max_completion_length=8,  # reduce the completion length to reduce memory usage
+            report_to="none",
+            entropy_coef=0.01,
+            use_adaptive_entropy=True,
+            entropy_target=15.0,  # above any realistic entropy → coef is always incremented
+        )
+        trainer = GRPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=training_args,
+            train_dataset=dataset,
+        )
+
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        trainer.train()
+
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+        assert trainer.state.log_history[-1]["policy_loss"] is not None
+        assert trainer.state.log_history[-1]["entropy_loss"] is not None
+        assert trainer.state.log_history[-1]["entropy_coef"] is not None
+        # Coefficient should have increased since entropy < target throughout training
+        assert trainer.entropy_coef > 0.01
+
+        # Check that the params have changed
+        for n, param in previous_trainable_params.items():
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
     def test_train_with_entropy_filter(self):
         dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
         training_args = GRPOConfig(

From 2484e70c30e2f7f5a443bed84ef5fd6b6a6c1ded Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 12:48:14 +0000
Subject: [PATCH 07/37] Address issues from review

---
 docs/source/grpo_trainer.md |  4 ++--
 trl/trainer/grpo_trainer.py | 32 ++++++++++++++++++++------------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 940f5947026..c738b26a0c9 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -187,8 +187,8 @@ While training and evaluating, we record the following reward metrics:
 - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
 - `policy_loss`: The policy gradient loss value (before any entropy bonus).
 - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
-- `entropy_loss`: The aggregated entropy used as the regularization term. Logged only if `entropy_coef` is nonzero.
-- `entropy_coef`: The current entropy coefficient. Logged only if `entropy_coef` is nonzero. Changes each optimizer step when `use_adaptive_entropy=True`.
+- `entropy_loss`: The aggregated entropy used as the regularization term. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
+- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
 - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
 - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:  \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
 - `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region:  \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\).
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 95e04e79fb2..1b8c4ad2e48 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2741,8 +2741,14 @@ def _compute_loss(self, model, inputs):
         else:
             raise ValueError(f"Unknown loss type: {self.loss_type}")
 
-        # Entropy bonus: add entropy regularization to encourage exploration
-        if self.entropy_coef != 0.0:
+        # Capture the pure policy loss for logging before entropy/aux modify it
+        policy_loss = loss.detach()
+
+        # Entropy bonus: add entropy regularization to encourage exploration.
+        # Gate: run whenever a non-zero static coef is set OR adaptive mode is enabled. Adaptive must always run even
+        # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy
+        # drops below entropy_target again.
+        if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
             if self.loss_type in ["grpo", "sapo", "luspo"]:
                 entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
             elif self.loss_type == "bnpo":
@@ -2754,14 +2760,16 @@ def _compute_loss(self, model, inputs):
 
             world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item()
             if self.use_adaptive_entropy:
-                if world_entropy < self.args.entropy_target:
-                    self.entropy_coef = min(
-                        self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
-                    )
-                else:
-                    self.entropy_coef = max(
-                        self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
-                    )
+                # Update the coefficient once per optimizer step, not per micro-batch
+                if self.accelerator.sync_gradients:
+                    if world_entropy < self.args.entropy_target:
+                        self.entropy_coef = min(
+                            self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+                        )
+                    else:
+                        self.entropy_coef = max(
+                            self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+                        )
                 apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0
             else:
                 apply_coef = self.entropy_coef
@@ -2783,8 +2791,8 @@ def masked_batch_mean(x):
             else:
                 return (x * mask).sum() / completion_token_count
 
-        self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(loss.detach(), reduction="mean").item())
-        if self.entropy_coef != 0.0:
+        self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(policy_loss, reduction="mean").item())
+        if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
             self._metrics[mode]["entropy_loss"].append(world_entropy)
             self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
 

From 45077474beff6be14afae94b44946a66f4326cbc Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:04:57 +0000
Subject: [PATCH 08/37] Fix wrong entropy for adaptive control

---
 docs/source/grpo_trainer.md | 2 +-
 trl/trainer/grpo_trainer.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index c738b26a0c9..7f19eb03cc1 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -187,7 +187,7 @@ While training and evaluating, we record the following reward metrics:
 - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
 - `policy_loss`: The policy gradient loss value (before any entropy bonus).
 - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
-- `entropy_loss`: The aggregated entropy used as the regularization term. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
+- `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
 - `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
 - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
 - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:  \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 1b8c4ad2e48..250a7650153 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2758,7 +2758,11 @@ def _compute_loss(self, model, inputs):
             elif self.loss_type in ["cispo", "dapo", "vespo"]:
                 entropy_loss = (entropies * mask).sum() / normalizer
 
-            world_entropy = self.accelerator.reduce(entropy_loss.detach(), reduction="mean").item()
+            # Mean per-token entropy in nats across ranks — computed independently of the loss
+            # normalizer so its scale matches entropy_target (loss-scaled entropy_loss would not).
+            world_entropy = self.accelerator.reduce(
+                ((entropies * mask).sum() / mask.sum().clamp(min=1.0)).detach(), reduction="mean"
+            ).item()
             if self.use_adaptive_entropy:
                 # Update the coefficient once per optimizer step, not per micro-batch
                 if self.accelerator.sync_gradients:

From 9b70a4a7986234deabd4fd5480d10340c68bd8f3 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:05:23 +0000
Subject: [PATCH 09/37] Fix Liger skips adaptive entropy guard

---
 trl/trainer/grpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 250a7650153..89d2b1e1044 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -670,7 +670,7 @@ def __init__(
             )
         self.entropy_coef = args.entropy_coef
         self.use_adaptive_entropy = args.use_adaptive_entropy
-        if self.use_liger_kernel and self.entropy_coef != 0.0:
+        if self.use_liger_kernel and (self.entropy_coef != 0.0 or self.use_adaptive_entropy):
             raise NotImplementedError("Entropy bonus is not supported with Liger kernel.")
 
         # Datasets

From 9d79e4a8e6896a5ad40b967e0ff8f62eb1c95e1c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:49:03 +0000
Subject: [PATCH 10/37] Fix inconsistent inequality

---
 trl/trainer/grpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 89d2b1e1044..b4f18fe59b7 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2766,7 +2766,7 @@ def _compute_loss(self, model, inputs):
             if self.use_adaptive_entropy:
                 # Update the coefficient once per optimizer step, not per micro-batch
                 if self.accelerator.sync_gradients:
-                    if world_entropy < self.args.entropy_target:
+                    if world_entropy <= self.args.entropy_target:
                         self.entropy_coef = min(
                             self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
                         )

From 46c8a64f6ba5ad156cbcbc24a1a2a7f2d4b89575 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:55:09 +0000
Subject: [PATCH 11/37] Fix mean reduction with sum-count-divide

---
 trl/trainer/grpo_trainer.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index b4f18fe59b7..b93369fc27e 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2758,11 +2758,13 @@ def _compute_loss(self, model, inputs):
             elif self.loss_type in ["cispo", "dapo", "vespo"]:
                 entropy_loss = (entropies * mask).sum() / normalizer
 
-            # Mean per-token entropy in nats across ranks — computed independently of the loss
-            # normalizer so its scale matches entropy_target (loss-scaled entropy_loss would not).
-            world_entropy = self.accelerator.reduce(
-                ((entropies * mask).sum() / mask.sum().clamp(min=1.0)).detach(), reduction="mean"
-            ).item()
+            # True global mean per-token entropy (nats): reduce sum and token count jointly so
+            # that ranks with fewer tokens don't get equal weight (averaging per-rank means would
+            # be biased when completion lengths differ across ranks).
+            entropy_stats = self.accelerator.reduce(
+                torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+            )
+            world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
             if self.use_adaptive_entropy:
                 # Update the coefficient once per optimizer step, not per micro-batch
                 if self.accelerator.sync_gradients:

From 3f7a6692080ae0f0b6e64804aa4ef8b8f64522ac Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:19:33 +0000
Subject: [PATCH 12/37] Set _last_world_entropy at init

---
 trl/trainer/grpo_trainer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index b93369fc27e..b1727d58d62 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -670,6 +670,9 @@ def __init__(
             )
         self.entropy_coef = args.entropy_coef
         self.use_adaptive_entropy = args.use_adaptive_entropy
+        # Cached entropy from the last optimizer step; inf so the first accumulation window
+        # applies no bonus until a real measurement arrives (conservative default).
+        self._last_world_entropy = float("inf")
         if self.use_liger_kernel and (self.entropy_coef != 0.0 or self.use_adaptive_entropy):
             raise NotImplementedError("Entropy bonus is not supported with Liger kernel.")
 

From a05c97907367a3f0e7754d6b0189b57f1183b1be Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:21:15 +0000
Subject: [PATCH 13/37] Cache world_entropy at sync point and use that cached
 value for apply_coef

---
 trl/trainer/grpo_trainer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index b1727d58d62..0d00f40257b 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2769,7 +2769,10 @@ def _compute_loss(self, model, inputs):
             )
             world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
             if self.use_adaptive_entropy:
-                # Update the coefficient once per optimizer step, not per micro-batch
+                # Update coefficient and cache entropy once per optimizer step, not per micro-batch.
+                # apply_coef uses the cached value so all micro-batches within one accumulation
+                # window apply the same bonus (using per-micro-batch world_entropy would cause
+                # the bonus to toggle on/off unpredictably across accumulation steps).
                 if self.accelerator.sync_gradients:
                     if world_entropy <= self.args.entropy_target:
                         self.entropy_coef = min(
@@ -2779,7 +2782,8 @@ def _compute_loss(self, model, inputs):
                         self.entropy_coef = max(
                             self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
                         )
-                apply_coef = self.entropy_coef if world_entropy <= self.args.entropy_target else 0.0
+                    self._last_world_entropy = world_entropy
+                apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0
             else:
                 apply_coef = self.entropy_coef
 

From fe03dd1037fb4c79e25ae2bbe9dee686db17ccad Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:30:58 +0000
Subject: [PATCH 14/37] Also persist _last_world_entropy

---
 trl/trainer/grpo_trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 0d00f40257b..7ace9470a58 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2947,7 +2947,7 @@ def _save_checkpoint(self, model, trial):
             checkpoint_folder = f"checkpoint-{self.state.global_step}"
             output_dir = os.path.join(self.args.output_dir, checkpoint_folder)
             with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:
-                json.dump({"entropy_coef": self.entropy_coef}, f)
+                json.dump({"entropy_coef": self.entropy_coef, "last_world_entropy": self._last_world_entropy}, f)
 
     def _load_optimizer_and_scheduler(self, checkpoint):
         super()._load_optimizer_and_scheduler(checkpoint)
@@ -2955,4 +2955,6 @@ def _load_optimizer_and_scheduler(self, checkpoint):
             path = os.path.join(checkpoint, "entropy_ctrl_state.json")
             if os.path.exists(path):
                 with open(path) as f:
-                    self.entropy_coef = json.load(f)["entropy_coef"]
+                    state = json.load(f)
+                self.entropy_coef = state["entropy_coef"]
+                self._last_world_entropy = state.get("last_world_entropy", float("inf"))

From f099349fa7d7d2fc5a74868b6f56e1b70b33d9ed Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:31:24 +0000
Subject: [PATCH 15/37] Add paper_index entry

---
 docs/source/paper_index.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md
index e8491893389..ea85a7b8594 100644
--- a/docs/source/paper_index.md
+++ b/docs/source/paper_index.md
@@ -225,6 +225,27 @@ training_args = GRPOConfig(
 )
 ```
 
+### Skywork-OR1: Open Reasoning Models
+
+**📜 Paper**: https://huggingface.co/papers/2505.22312
+
+Skywork-OR1 is a family of open reasoning models trained with GRPO. The paper introduces **adaptive entropy control**: an entropy regularization term `−α·H(π_θ)` is added to the GRPO loss (i.e., the policy is rewarded for higher entropy), and the coefficient `α` is automatically adjusted each optimizer step. When the model's mean per-token entropy is at or below a target, `α` is incremented by `entropy_coef_delta` to encourage more exploration; otherwise it is decremented by the same amount, with `α` kept within `[entropy_coef_min, entropy_coef_max]`. The bonus is only applied while entropy is at or below the target. To replicate this adaptive entropy control, use the following configuration:
+
+```python
+from trl import GRPOConfig, GRPOTrainer
+
+training_args = GRPOConfig(
+    use_adaptive_entropy=True,   # enable adaptive entropy control (Section 3.3 of the paper)
+    entropy_coef=0.01,           # initial entropy regularization coefficient
+    entropy_target=5.0,          # target mean per-token entropy (nats); tune for your model
+    entropy_coef_delta=0.005,    # step size for coefficient updates per optimizer step
+)
+trainer = GRPOTrainer(
+    ...,
+    args=training_args,
+)
+```
+
 ### Beyond the 80/20 Rule: High-Entropy Minority Tokens Drive Effective Reinforcement Learning for LLM Reasoning
 
 **📜 Paper**: https://huggingface.co/papers/2506.01939

From 5288cd5e2987a0952e17d9760346e66715dec537 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:18:06 +0000
Subject: [PATCH 16/37] Capture the pure policy loss before normalization

---
 trl/trainer/grpo_trainer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 7ace9470a58..9c0c34a46a4 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2724,29 +2724,31 @@ def _compute_loss(self, model, inputs):
         if self.loss_type in ["grpo", "sapo"]:
             loss = ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean()
             normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0  # no accum in eval
+            policy_loss = loss.detach()
             loss = loss / normalizer
         elif self.loss_type == "bnpo":
             loss = (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)
             normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0  # no accum in eval
+            policy_loss = loss.detach()
             loss = loss / normalizer
         elif self.loss_type == "dr_grpo":
             loss = (per_token_loss * mask).sum() / (per_token_loss.size(0) * self.max_completion_length)
             normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0  # no accum in eval
+            policy_loss = loss.detach()
             loss = loss / normalizer
         elif self.loss_type in ["cispo", "dapo", "vespo"]:
             normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
             loss = (per_token_loss * mask).sum() / normalizer
+            policy_loss = loss.detach()
         elif self.loss_type == "luspo":
             # Unless importance_sampling_level="token" (not recommended here), per_token_loss is expected to be (B, 1)
             loss = (per_token_loss * mask.sum(1, keepdim=True)).mean()
             normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0
+            policy_loss = loss.detach()
             loss = loss / normalizer
         else:
             raise ValueError(f"Unknown loss type: {self.loss_type}")
 
-        # Capture the pure policy loss for logging before entropy/aux modify it
-        policy_loss = loss.detach()
-
         # Entropy bonus: add entropy regularization to encourage exploration.
         # Gate: run whenever a non-zero static coef is set OR adaptive mode is enabled. Adaptive must always run even
         # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy

From 03f4208c02b3ae854ef8378360d20b340590d9d9 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:18:28 +0000
Subject: [PATCH 17/37] Fix luspo loss

---
 trl/trainer/grpo_trainer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 9c0c34a46a4..5c06e0bc368 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2754,7 +2754,7 @@ def _compute_loss(self, model, inputs):
         # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy
         # drops below entropy_target again.
         if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
-            if self.loss_type in ["grpo", "sapo", "luspo"]:
+            if self.loss_type in ["grpo", "sapo"]:
                 entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
             elif self.loss_type == "bnpo":
                 entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer
@@ -2762,6 +2762,9 @@ def _compute_loss(self, model, inputs):
                 entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
             elif self.loss_type in ["cispo", "dapo", "vespo"]:
                 entropy_loss = (entropies * mask).sum() / normalizer
+            elif self.loss_type == "luspo":
+                # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
+                entropy_loss = (entropies * mask).sum(-1).mean() / normalizer
 
             # True global mean per-token entropy (nats): reduce sum and token count jointly so
             # that ranks with fewer tokens don't get equal weight (averaging per-rank means would

From dbc0c7592c6437d0c45b2cdc78043e90c7cdc75d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:19:43 +0000
Subject: [PATCH 18/37] Gate policy_loss logging and align style

---
 trl/trainer/grpo_trainer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 5c06e0bc368..db4926a71ec 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2794,6 +2794,10 @@ def _compute_loss(self, model, inputs):
 
             loss = loss - apply_coef * entropy_loss
 
+            self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
+            self._metrics[mode]["entropy_loss"].append(world_entropy)
+            self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
+
         # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too
         if self.aux_loss_enabled:
             normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0
@@ -2809,11 +2813,6 @@ def masked_batch_mean(x):
             else:
                 return (x * mask).sum() / completion_token_count
 
-        self._metrics[mode]["policy_loss"].append(self.accelerator.reduce(policy_loss, reduction="mean").item())
-        if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
-            self._metrics[mode]["entropy_loss"].append(world_entropy)
-            self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
-
         if self.beta != 0.0:
             mean_kl = masked_batch_mean(per_token_kl)
             self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item())

From 506fbf9f4cd8b838b90c6ed6982ae432c7baf70c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:01:00 +0000
Subject: [PATCH 19/37] Fix entropy state written to wrong path

---
 trl/trainer/grpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index aeef9676067..1576425522d 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2959,7 +2959,7 @@ def _save_checkpoint(self, model, trial):
         super()._save_checkpoint(model, trial)
         if self.use_adaptive_entropy and self.is_world_process_zero():
             checkpoint_folder = f"checkpoint-{self.state.global_step}"
-            output_dir = os.path.join(self.args.output_dir, checkpoint_folder)
+            output_dir = os.path.join(self._get_output_dir(trial=trial), checkpoint_folder)
             with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:
                 json.dump({"entropy_coef": self.entropy_coef, "last_world_entropy": self._last_world_entropy}, f)
 

From 8a6b53dde71d9340a2491bbc83744fca9e35484d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:01:43 +0000
Subject: [PATCH 20/37] Fix is_world_process_zero() vs args.should_save guard
 mismatch

---
 trl/trainer/grpo_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 1576425522d..e6f60c1831f 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2957,7 +2957,7 @@ def _save_checkpoint(self, model, trial):
             model_name = self.args.hub_model_id.split("/")[-1]
         self.create_model_card(model_name=model_name)
         super()._save_checkpoint(model, trial)
-        if self.use_adaptive_entropy and self.is_world_process_zero():
+        if self.use_adaptive_entropy and self.args.should_save:
             checkpoint_folder = f"checkpoint-{self.state.global_step}"
             output_dir = os.path.join(self._get_output_dir(trial=trial), checkpoint_folder)
             with open(os.path.join(output_dir, "entropy_ctrl_state.json"), "w") as f:

From 474b30c479c795403a041439e781c66d9974e345 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:03:03 +0000
Subject: [PATCH 21/37] Update docs: policy_loss only logged inside entropy
 block

---
 docs/source/grpo_trainer.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 7f19eb03cc1..bdd366ee3d5 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -185,7 +185,7 @@ While training and evaluating, we record the following reward metrics:
 - `reward`: The overall average reward after summing rewards across functions (weighted by `reward_weights`).
 - `reward_std`: The standard deviation of summed rewards across functions (weighted by `reward_weights`), computed over the full batch.
 - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
-- `policy_loss`: The policy gradient loss value (before any entropy bonus).
+- `policy_loss`: The policy gradient loss value (before any entropy bonus). Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
 - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
 - `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
 - `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.

From a0b9ec68aa11fe3399dc503796cc4296d3240b8e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:04:37 +0000
Subject: [PATCH 22/37] Log entropy_coef only when sync_gradients=True

---
 trl/trainer/grpo_trainer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index e6f60c1831f..03bbbdda207 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2806,7 +2806,10 @@ def _compute_loss(self, model, inputs):
 
             self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
             self._metrics[mode]["entropy_loss"].append(world_entropy)
-            self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
+            # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients),
+            # so logging K identical values per step would dilute the metric with stale data.
+            if self.accelerator.sync_gradients:
+                self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
 
         # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too
         if self.aux_loss_enabled:

From 608b1e0a8772697eebf4b442dbebddd1cdde80eb Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:06:30 +0000
Subject: [PATCH 23/37] Add guard for entropy-loss dispatch matching
 policy-loss dispatch

---
 trl/trainer/grpo_trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 03bbbdda207..387eacd15fb 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2775,6 +2775,8 @@ def _compute_loss(self, model, inputs):
             elif self.loss_type == "luspo":
                 # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
                 entropy_loss = (entropies * mask).sum(-1).mean() / normalizer
+            else:
+                raise ValueError(f"Unknown loss type: {self.loss_type}")
 
             # True global mean per-token entropy (nats): reduce sum and token count jointly so
             # that ranks with fewer tokens don't get equal weight (averaging per-rank means would

From 81841ad99cb4e0f94ddc2fbffaf5b66286a674f8 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:18:38 +0000
Subject: [PATCH 24/37] Remove entropy_loss

---
 docs/source/grpo_trainer.md |  3 +--
 tests/test_grpo_trainer.py  |  2 --
 trl/trainer/grpo_trainer.py | 20 ++++++++------------
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index bdd366ee3d5..e6fb82f1f88 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -187,8 +187,7 @@ While training and evaluating, we record the following reward metrics:
 - `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
 - `policy_loss`: The policy gradient loss value (before any entropy bonus). Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
 - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
-- `entropy_loss`: Mean per-token entropy (nats) used as the regularization signal. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`.
-- `entropy_coef`: The current entropy coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
+- `entropy_coef`: The current entropy regularization coefficient. Logged when `entropy_coef` is nonzero or `use_adaptive_entropy=True`. Updated once per optimizer step when `use_adaptive_entropy=True`.
 - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
 - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:  \\( \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \quad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \\). A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
 - `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region:  \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\).
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index 186a815d2eb..29f07aff5ff 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1498,7 +1498,6 @@ def test_train_with_static_entropy(self):
 
         assert trainer.state.log_history[-1]["train_loss"] is not None
         assert trainer.state.log_history[-1]["policy_loss"] is not None
-        assert trainer.state.log_history[-1]["entropy_loss"] is not None
         assert trainer.state.log_history[-1]["entropy_coef"] is not None
 
         # Check that the params have changed
@@ -1532,7 +1531,6 @@ def test_train_with_adaptive_entropy(self):
 
         assert trainer.state.log_history[-1]["train_loss"] is not None
         assert trainer.state.log_history[-1]["policy_loss"] is not None
-        assert trainer.state.log_history[-1]["entropy_loss"] is not None
         assert trainer.state.log_history[-1]["entropy_coef"] is not None
         # Coefficient should have increased since entropy < target throughout training
         assert trainer.entropy_coef > 0.01
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 387eacd15fb..21fadcc16b6 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2778,18 +2778,15 @@ def _compute_loss(self, model, inputs):
             else:
                 raise ValueError(f"Unknown loss type: {self.loss_type}")
 
-            # True global mean per-token entropy (nats): reduce sum and token count jointly so
-            # that ranks with fewer tokens don't get equal weight (averaging per-rank means would
-            # be biased when completion lengths differ across ranks).
-            entropy_stats = self.accelerator.reduce(
-                torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
-            )
-            world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
             if self.use_adaptive_entropy:
-                # Update coefficient and cache entropy once per optimizer step, not per micro-batch.
-                # apply_coef uses the cached value so all micro-batches within one accumulation
-                # window apply the same bonus (using per-micro-batch world_entropy would cause
-                # the bonus to toggle on/off unpredictably across accumulation steps).
+                # Reduce sum and token count jointly for a true global mean (unbiased when ranks
+                # have different completion lengths). Update coefficient and cache entropy once per
+                # optimizer step; apply_coef uses the cached value so all micro-batches within one
+                # accumulation window apply the same bonus.
+                entropy_stats = self.accelerator.reduce(
+                    torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+                )
+                world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
                 if self.accelerator.sync_gradients:
                     if world_entropy <= self.args.entropy_target:
                         self.entropy_coef = min(
@@ -2807,7 +2804,6 @@ def _compute_loss(self, model, inputs):
             loss = loss - apply_coef * entropy_loss
 
             self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
-            self._metrics[mode]["entropy_loss"].append(world_entropy)
             # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients),
             # so logging K identical values per step would dilute the metric with stale data.
             if self.accelerator.sync_gradients:

From bee5126842614b1d3fbdf0df01df498d95ca5c3b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:22:53 +0000
Subject: [PATCH 25/37] Gate on train mode to avoid entropy state update during
 eval

---
 trl/trainer/grpo_trainer.py | 44 +++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 21fadcc16b6..bebb43a4481 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2779,24 +2779,26 @@ def _compute_loss(self, model, inputs):
                 raise ValueError(f"Unknown loss type: {self.loss_type}")
 
             if self.use_adaptive_entropy:
-                # Reduce sum and token count jointly for a true global mean (unbiased when ranks
-                # have different completion lengths). Update coefficient and cache entropy once per
-                # optimizer step; apply_coef uses the cached value so all micro-batches within one
-                # accumulation window apply the same bonus.
-                entropy_stats = self.accelerator.reduce(
-                    torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
-                )
-                world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
-                if self.accelerator.sync_gradients:
-                    if world_entropy <= self.args.entropy_target:
-                        self.entropy_coef = min(
-                            self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
-                        )
-                    else:
-                        self.entropy_coef = max(
-                            self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
-                        )
-                    self._last_world_entropy = world_entropy
+                if mode == "train":
+                    # Reduce sum and token count jointly for a true global mean (unbiased when ranks
+                    # have different completion lengths). Update coefficient and cache entropy once per
+                    # optimizer step; apply_coef uses the cached value so all micro-batches within one
+                    # accumulation window apply the same bonus. Gated on train mode so evaluation
+                    # cannot mutate the entropy controller state.
+                    entropy_stats = self.accelerator.reduce(
+                        torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+                    )
+                    world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
+                    if self.accelerator.sync_gradients:
+                        if world_entropy <= self.args.entropy_target:
+                            self.entropy_coef = min(
+                                self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+                            )
+                        else:
+                            self.entropy_coef = max(
+                                self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+                            )
+                        self._last_world_entropy = world_entropy
                 apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0
             else:
                 apply_coef = self.entropy_coef
@@ -2804,9 +2806,9 @@ def _compute_loss(self, model, inputs):
             loss = loss - apply_coef * entropy_loss
 
             self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
-            # Log entropy_coef only on optimizer-step boundaries: it updates once per step (sync_gradients),
-            # so logging K identical values per step would dilute the metric with stale data.
-            if self.accelerator.sync_gradients:
+            # Log entropy_coef only on train optimizer-step boundaries: it updates once per step
+            # (sync_gradients), and sync_gradients is always True in eval (no accumulation context).
+            if mode == "train" and self.accelerator.sync_gradients:
                 self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
 
         # The policy loss above is scaled for gradient accumulation (HF auto-scaling is off here), so scale aux too

From 2f34d156c5cc069fa5a2b7d58b1ce7b6b0c079b8 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 14:57:02 +0000
Subject: [PATCH 26/37] Fix entropy bonus ignores quantile mask

---
 trl/trainer/grpo_trainer.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index bebb43a4481..9b1a5a1c79e 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2764,17 +2764,22 @@ def _compute_loss(self, model, inputs):
         # when self.entropy_coef has been decremented to entropy_coef_min (default 0) so it can recover once entropy
         # drops below entropy_target again.
         if self.entropy_coef != 0.0 or self.use_adaptive_entropy:
+            # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
+            # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
+            effective_mask = mask if entropy_mask is None else mask * entropy_mask
             if self.loss_type in ["grpo", "sapo"]:
-                entropy_loss = ((entropies * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
+                entropy_loss = ((entropies * effective_mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
             elif self.loss_type == "bnpo":
-                entropy_loss = (entropies * mask).sum() / mask.sum().clamp(min=1.0) / normalizer
+                entropy_loss = (entropies * effective_mask).sum() / mask.sum().clamp(min=1.0) / normalizer
             elif self.loss_type == "dr_grpo":
-                entropy_loss = (entropies * mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+                entropy_loss = (
+                    (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+                )
             elif self.loss_type in ["cispo", "dapo", "vespo"]:
-                entropy_loss = (entropies * mask).sum() / normalizer
+                entropy_loss = (entropies * effective_mask).sum() / normalizer
             elif self.loss_type == "luspo":
                 # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
-                entropy_loss = (entropies * mask).sum(-1).mean() / normalizer
+                entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
             else:
                 raise ValueError(f"Unknown loss type: {self.loss_type}")
 

From 806078dda8370ee5479fcb5d50730ed3c5c826a4 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:05:48 +0000
Subject: [PATCH 27/37] Use effective_mask for the world_entropy all-reduce too

---
 trl/trainer/grpo_trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 9b1a5a1c79e..8107d5827b8 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2791,7 +2791,8 @@ def _compute_loss(self, model, inputs):
                     # accumulation window apply the same bonus. Gated on train mode so evaluation
                     # cannot mutate the entropy controller state.
                     entropy_stats = self.accelerator.reduce(
-                        torch.stack([(entropies * mask).sum(), mask.sum()]).detach(), reduction="sum"
+                        torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(),
+                        reduction="sum",
                     )
                     world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
                     if self.accelerator.sync_gradients:

From 2845ef455f3132a1caad88dc0bb6bca0a799f147 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:06:11 +0000
Subject: [PATCH 28/37] Update docs

---
 docs/source/grpo_trainer.md |  2 +-
 trl/trainer/grpo_config.py  | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index e6fb82f1f88..0cb882a6846 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -677,7 +677,7 @@ training_args = GRPOConfig(
 
 <Tip>
 
-Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric).
+Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly.
 
 </Tip>
 
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index 230c8f50c10..a83dd7c0349 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -314,9 +314,11 @@ class GRPOConfig(_BaseConfig):
             Step size for adjusting the entropy coefficient at each optimizer step during adaptive entropy control.
         entropy_target (`float`, *optional*, defaults to `0.2`):
             Target mean per-token entropy (in nats) used by adaptive entropy control. The coefficient is only
-            applied when the current entropy falls at or below this value. Typical language models have per-token
-            entropies in the range 2–10 nats; the default of `0.2` nearly always triggers regularization, so users
-            should tune this to a value appropriate for their model and task.
+            applied when the current entropy falls at or below this value. Measured over the same token set as
+            the policy loss: all completion tokens by default, or only the high-entropy subset when
+            `top_entropy_quantile < 1.0`. Typical language models have per-token entropies in the range 2–10
+            nats; the default of `0.2` nearly always triggers regularization, so users should tune this to a
+            value appropriate for their model and task (and token subset when using `top_entropy_quantile`).
         max_tool_calling_iterations (`int`, *optional*):
             Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation
             stops when the model generates a response turn with no tool calls or when the total response length reaches
@@ -886,8 +888,10 @@ class GRPOConfig(_BaseConfig):
         default=0.2,
         metadata={
             "help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only "
-            "applied when current entropy is at or below this value. Typical language models have per-token "
-            "entropies of 2–10 nats; the default of 0.2 nearly always triggers regularization, so tune this."
+            "applied when current entropy is at or below this value. Measured over the same token set as the "
+            "policy loss (all completion tokens, or the high-entropy subset when top_entropy_quantile < 1.0). "
+            "Typical language models have per-token entropies of 2–10 nats; the default of 0.2 nearly always "
+            "triggers regularization, so tune this."
         },
     )
     max_tool_calling_iterations: int | None = field(

From 2ed11c0a7d9f43d65364dd1b20a05a66f0c4d87b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:25:11 +0000
Subject: [PATCH 29/37] Use unified formula with mean per-token entropy of
 active tokens

---
 trl/trainer/grpo_trainer.py | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 8107d5827b8..305fcbb375e 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2767,21 +2767,10 @@ def _compute_loss(self, model, inputs):
             # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
             # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
             effective_mask = mask if entropy_mask is None else mask * entropy_mask
-            if self.loss_type in ["grpo", "sapo"]:
-                entropy_loss = ((entropies * effective_mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() / normalizer
-            elif self.loss_type == "bnpo":
-                entropy_loss = (entropies * effective_mask).sum() / mask.sum().clamp(min=1.0) / normalizer
-            elif self.loss_type == "dr_grpo":
-                entropy_loss = (
-                    (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
-                )
-            elif self.loss_type in ["cispo", "dapo", "vespo"]:
-                entropy_loss = (entropies * effective_mask).sum() / normalizer
-            elif self.loss_type == "luspo":
-                # luspo weights each sequence by its token count, so entropy is summed (not per-token averaged) per sequence
-                entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
-            else:
-                raise ValueError(f"Unknown loss type: {self.loss_type}")
+            # Mean per-active-token entropy, scaled for gradient accumulation like the policy loss.
+            # Uniform across all loss types so entropy_target and entropy_coef have consistent units
+            # (per-token nats) and match the world_entropy computed in the adaptive block below.
+            entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
 
             if self.use_adaptive_entropy:
                 if mode == "train":

From 76255d3016e79ceace966c6bf50e6e3ff60b448f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 14:52:43 +0200
Subject: [PATCH 30/37] Make three-branch entropy-loss split

---
 trl/trainer/grpo_trainer.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 50e8337e77b..280ea1eb446 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2767,10 +2767,21 @@ def _compute_loss(self, model, inputs):
             # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
             # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
             effective_mask = mask if entropy_mask is None else mask * entropy_mask
-            # Mean per-active-token entropy, scaled for gradient accumulation like the policy loss.
-            # Uniform across all loss types so entropy_target and entropy_coef have consistent units
-            # (per-token nats) and match the world_entropy computed in the adaptive block below.
-            entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
+            # The entropy bonus must be normalized exactly like each loss type's policy loss, so that
+            # entropy_coef stays on a consistent scale and gradient accumulation remains correct. The
+            # normalizer differs by loss type: it is the gradient accumulation step count for the grpo
+            # family, but a global token count for the cispo/dapo/vespo family.
+            if self.loss_type in ["cispo", "dapo", "vespo"]:
+                # normalizer is a global token count, so summing the entropies accumulates over the
+                # optimizer step to the global token-weighted mean entropy, matching world_entropy below.
+                entropy_loss = (entropies * effective_mask).sum() / normalizer
+            elif self.loss_type == "luspo":
+                # luspo weights each sequence by its token count, so entropy is summed per sequence (not
+                # per-token averaged) to stay on the same scale as the policy loss.
+                entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
+            else:  # grpo, sapo, bnpo, dr_grpo: normalizer is the gradient accumulation step count
+                # Token-weighted mean entropy of active tokens, matching world_entropy below.
+                entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
 
             if self.use_adaptive_entropy:
                 if mode == "train":

From fc76d4b497dab5310be5b222945162851041fb18 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 14:53:49 +0200
Subject: [PATCH 31/37] Compute bonus from frozen state, update per optimizer
 step

---
 trl/trainer/grpo_trainer.py | 51 ++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 280ea1eb446..886e9064a87 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2783,28 +2783,10 @@ def _compute_loss(self, model, inputs):
                 # Token-weighted mean entropy of active tokens, matching world_entropy below.
                 entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
 
+            # Apply the coefficient and gating from the end of the previous optimizer step, so that every
+            # micro-batch in the current accumulation window applies the same entropy bonus. The adaptive
+            # update below only takes effect on the next step.
             if self.use_adaptive_entropy:
-                if mode == "train":
-                    # Reduce sum and token count jointly for a true global mean (unbiased when ranks
-                    # have different completion lengths). Update coefficient and cache entropy once per
-                    # optimizer step; apply_coef uses the cached value so all micro-batches within one
-                    # accumulation window apply the same bonus. Gated on train mode so evaluation
-                    # cannot mutate the entropy controller state.
-                    entropy_stats = self.accelerator.reduce(
-                        torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(),
-                        reduction="sum",
-                    )
-                    world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
-                    if self.accelerator.sync_gradients:
-                        if world_entropy <= self.args.entropy_target:
-                            self.entropy_coef = min(
-                                self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
-                            )
-                        else:
-                            self.entropy_coef = max(
-                                self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
-                            )
-                        self._last_world_entropy = world_entropy
                 apply_coef = self.entropy_coef if self._last_world_entropy <= self.args.entropy_target else 0.0
             else:
                 apply_coef = self.entropy_coef
@@ -2812,8 +2794,31 @@ def _compute_loss(self, model, inputs):
             loss = loss - apply_coef * entropy_loss
 
             self._metrics[mode]["policy_loss"].append(self.accelerator.gather(policy_loss).nanmean().item())
-            # Log entropy_coef only on train optimizer-step boundaries: it updates once per step
-            # (sync_gradients), and sync_gradients is always True in eval (no accumulation context).
+
+            # Adaptive update: once per optimizer step, measure the global token-weighted entropy and adjust
+            # the coefficient for the next step. Gated on train mode so evaluation cannot mutate the entropy
+            # controller state, and on sync_gradients so the all-reduce runs once per optimizer step rather
+            # than on every micro-batch of the accumulation window.
+            if self.use_adaptive_entropy and mode == "train" and self.accelerator.sync_gradients:
+                # Reduce sum and token count jointly for a true global mean (unbiased when ranks have
+                # different completion lengths).
+                entropy_stats = self.accelerator.reduce(
+                    torch.stack([(entropies * effective_mask).sum(), effective_mask.sum()]).detach(),
+                    reduction="sum",
+                )
+                world_entropy = (entropy_stats[0] / entropy_stats[1].clamp(min=1.0)).item()
+                if world_entropy <= self.args.entropy_target:
+                    self.entropy_coef = min(
+                        self.entropy_coef + self.args.entropy_coef_delta, self.args.entropy_coef_max
+                    )
+                else:
+                    self.entropy_coef = max(
+                        self.entropy_coef - self.args.entropy_coef_delta, self.args.entropy_coef_min
+                    )
+                self._last_world_entropy = world_entropy
+
+            # Log entropy_coef on train optimizer-step boundaries (constant for static control; updated just
+            # above for adaptive control). sync_gradients is always True in eval (no accumulation context).
             if mode == "train" and self.accelerator.sync_gradients:
                 self._metrics[mode]["entropy_coef"].append(self.entropy_coef)
 

From bed5188839e63bfac5deefd74b7fb39eced5f770 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:00:44 +0200
Subject: [PATCH 32/37] Fix "nearly always triggers" docs

---
 docs/source/grpo_trainer.md |  2 +-
 trl/trainer/grpo_config.py  | 11 +++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index 0cb882a6846..c05cbd389b7 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -677,7 +677,7 @@ training_args = GRPOConfig(
 
 <Tip>
 
-Typical language models have per-token entropies of 2–10 nats. The default `entropy_target=0.2` nearly always triggers regularization; set it to a value meaningful for your model (e.g. the entropy you observe early in training, logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly.
+Typical language models have per-token entropies of 2–10 nats, so the default `entropy_target=0.2` almost never triggers regularization — the bonus only engages once entropy is at or below the target, i.e. near-complete collapse. Set it to a value meaningful for your model, e.g. close to the entropy you observe early in training (logged as the `entropy` metric). When using `top_entropy_quantile < 1.0`, `entropy_target` applies to the high-entropy token subset — that subset's entropy will be higher than the logged full-token `entropy`, so calibrate accordingly.
 
 </Tip>
 
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index a83dd7c0349..5c469aecfdc 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -317,8 +317,10 @@ class GRPOConfig(_BaseConfig):
             applied when the current entropy falls at or below this value. Measured over the same token set as
             the policy loss: all completion tokens by default, or only the high-entropy subset when
             `top_entropy_quantile < 1.0`. Typical language models have per-token entropies in the range 2–10
-            nats; the default of `0.2` nearly always triggers regularization, so users should tune this to a
-            value appropriate for their model and task (and token subset when using `top_entropy_quantile`).
+            nats, so the default of `0.2` almost never triggers regularization (only on near-complete entropy
+            collapse); set it close to the entropy you observe early in training (logged as the `entropy`
+            metric) so the bonus engages before the policy collapses (and account for the token subset when
+            using `top_entropy_quantile`).
         max_tool_calling_iterations (`int`, *optional*):
             Maximum number of tool-calling turns when training an agent. If `None`, there is no limit and generation
             stops when the model generates a response turn with no tool calls or when the total response length reaches
@@ -890,8 +892,9 @@ class GRPOConfig(_BaseConfig):
             "help": "Target mean per-token entropy (nats) for adaptive entropy control. The coefficient is only "
             "applied when current entropy is at or below this value. Measured over the same token set as the "
             "policy loss (all completion tokens, or the high-entropy subset when top_entropy_quantile < 1.0). "
-            "Typical language models have per-token entropies of 2–10 nats; the default of 0.2 nearly always "
-            "triggers regularization, so tune this."
+            "Typical language models have per-token entropies of 2–10 nats, so the default of 0.2 almost never "
+            "triggers regularization (only on near-complete collapse); set it close to the entropy observed "
+            "early in training and tune from there."
         },
     )
     max_tool_calling_iterations: int | None = field(

From 6e8f498ed10927d40131bfd5e6d4be587bdc23bd Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:02:28 +0200
Subject: [PATCH 33/37] Add scale test and grad-accumulation adaptive test

---
 tests/test_grpo_trainer.py | 87 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index 29f07aff5ff..8c816bcb33a 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1540,6 +1540,93 @@ def test_train_with_adaptive_entropy(self):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
+    @pytest.mark.parametrize("loss_type", ["grpo", "dapo", "luspo"])
+    def test_entropy_bonus_scale(self, loss_type):
+        # Regression test: the entropy bonus must be normalized like each loss type's policy loss. A previous
+        # "unified" formula divided the per-token mean entropy by the loss normalizer, which for the
+        # cispo/dapo/vespo family is a global token count, making the bonus ~1/sequence_length too small; and
+        # it put luspo's bonus on the per-token scale instead of luspo's sequence-weighted scale. With
+        # gradient_accumulation_steps=1 the per-step entropy contribution to the loss is
+        # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy reveals the scale.
+        entropy_coef = 0.5
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+        training_args = GRPOConfig(
+            output_dir=self.tmp_dir,
+            importance_sampling_level="sequence" if loss_type == "luspo" else "token",
+            learning_rate=0.1,  # use higher lr because gradients are tiny and default lr can stall updates
+            per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+            num_generations=3,  # reduce the number of generations to reduce memory usage
+            max_completion_length=16,  # long enough that the per-token vs sequence-weighted scales differ
+            gradient_accumulation_steps=1,  # so contrib == entropy_coef * entropy_loss holds per step
+            loss_type=loss_type,
+            logging_steps=1,
+            report_to="none",
+            entropy_coef=entropy_coef,
+        )
+        trainer = GRPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=training_args,
+            train_dataset=dataset,
+        )
+
+        trainer.train()
+
+        logs = [h for h in trainer.state.log_history if "policy_loss" in h and "loss" in h and h.get("entropy")]
+        assert logs  # at least one logged step has loss, policy_loss and a non-zero entropy
+        ratios = sorted((h["policy_loss"] - h["loss"]) / h["entropy"] for h in logs)  # contrib / entropy per step
+        ratio = ratios[len(ratios) // 2]  # median, robust to per-step noise
+        if loss_type == "luspo":
+            # luspo weights each sequence by its length, so the bonus is the per-sequence entropy sum: its
+            # scale is entropy_coef * (mean sequence length), well above entropy_coef. The buggy per-token
+            # formula gave ratio == entropy_coef.
+            assert ratio > 1.5 * entropy_coef
+        else:
+            # grpo (and the cispo/dapo/vespo family) regularize the per-token mean entropy, so the bonus is
+            # exactly entropy_coef * entropy. The buggy formula made dapo's ratio smaller by ~1/seq_len.
+            assert ratio == pytest.approx(entropy_coef, rel=0.3)
+
+    def test_train_with_adaptive_entropy_gradient_accumulation(self):
+        # Adaptive entropy must behave correctly under gradient accumulation: the coefficient and gating are
+        # frozen across an accumulation window and the controller updates once per optimizer step (not once
+        # per micro-batch). With entropy_target above any realistic entropy the coefficient is incremented by
+        # entropy_coef_delta on every optimizer step, so the final value pins down the number of updates.
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+        training_args = GRPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,  # use higher lr because gradients are tiny and default lr can stall updates
+            per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+            num_generations=3,  # reduce the number of generations to reduce memory usage
+            max_completion_length=8,  # reduce the completion length to reduce memory usage
+            gradient_accumulation_steps=2,  # exercise the accumulation window
+            report_to="none",
+            entropy_coef=0.01,
+            use_adaptive_entropy=True,
+            entropy_target=15.0,  # above any realistic entropy → coef incremented once per optimizer step
+            entropy_coef_delta=0.005,
+        )
+        trainer = GRPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=training_args,
+            train_dataset=dataset,
+        )
+
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        trainer.train()
+
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+        # Exactly one increment per optimizer step (global_step counts optimizer steps, not micro-batches),
+        # clamped at the configured entropy_coef_max; a per-micro-batch update would overshoot this.
+        expected_coef = min(0.01 + 0.005 * trainer.state.global_step, training_args.entropy_coef_max)
+        assert trainer.entropy_coef == pytest.approx(expected_coef, abs=1e-6)
+
+        # Check that the params have changed
+        for n, param in previous_trainable_params.items():
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
     def test_train_with_entropy_filter(self):
         dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
         training_args = GRPOConfig(

From 607d911d95cfcdd631f8164fb307b4ca433a7a6c Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:47:33 +0200
Subject: [PATCH 34/37] Fix dr_grpo entropy scale mismatch

---
 trl/trainer/grpo_trainer.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 886e9064a87..c2e4b211c8c 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2775,11 +2775,18 @@ def _compute_loss(self, model, inputs):
                 # normalizer is a global token count, so summing the entropies accumulates over the
                 # optimizer step to the global token-weighted mean entropy, matching world_entropy below.
                 entropy_loss = (entropies * effective_mask).sum() / normalizer
+            elif self.loss_type == "dr_grpo":
+                # Dr. GRPO normalizes by the fixed budget (batch size × max completion length) instead of the
+                # actual token count, to remove length bias; scale the entropy bonus the same way so that
+                # entropy_coef stays consistent with the policy term when completions are shorter than the max.
+                entropy_loss = (
+                    (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
+                )
             elif self.loss_type == "luspo":
                 # luspo weights each sequence by its token count, so entropy is summed per sequence (not
                 # per-token averaged) to stay on the same scale as the policy loss.
                 entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
-            else:  # grpo, sapo, bnpo, dr_grpo: normalizer is the gradient accumulation step count
+            else:  # grpo, sapo, bnpo: normalizer is the gradient accumulation step count
                 # Token-weighted mean entropy of active tokens, matching world_entropy below.
                 entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
 

From 0cfad37a15b4eeb45a888f7b428b74999c4c934d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 16:07:55 +0200
Subject: [PATCH 35/37] Accumulate to mean per-token entropy, independent of
 how each loss type normalizes

---
 trl/trainer/grpo_trainer.py | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index c2e4b211c8c..ead9707d764 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -2767,27 +2767,18 @@ def _compute_loss(self, model, inputs):
             # When top_entropy_quantile < 1.0, entropy_mask restricts policy gradients to high-entropy
             # tokens. Use the same effective mask for the entropy bonus so it acts on the same tokens.
             effective_mask = mask if entropy_mask is None else mask * entropy_mask
-            # The entropy bonus must be normalized exactly like each loss type's policy loss, so that
-            # entropy_coef stays on a consistent scale and gradient accumulation remains correct. The
-            # normalizer differs by loss type: it is the gradient accumulation step count for the grpo
-            # family, but a global token count for the cispo/dapo/vespo family.
+            # The entropy bonus is the mean per-token entropy H of the documented objective L = L_policy - coef * H;
+            # H is defined independently of how each loss type normalizes its policy term. Each branch below is
+            # scaled so that the term accumulates to H over the optimizer step for every loss type and matches
+            # world_entropy below. The only wrinkle is `normalizer`: for most loss types it is the gradient
+            # accumulation step count, but for cispo/dapo/vespo it is a global token count.
             if self.loss_type in ["cispo", "dapo", "vespo"]:
-                # normalizer is a global token count, so summing the entropies accumulates over the
-                # optimizer step to the global token-weighted mean entropy, matching world_entropy below.
+                # normalizer is a global token count, so summing the entropies (instead of averaging them
+                # again) makes the term accumulate over the optimizer step to the global mean per-token
+                # entropy, like the other loss types.
                 entropy_loss = (entropies * effective_mask).sum() / normalizer
-            elif self.loss_type == "dr_grpo":
-                # Dr. GRPO normalizes by the fixed budget (batch size × max completion length) instead of the
-                # actual token count, to remove length bias; scale the entropy bonus the same way so that
-                # entropy_coef stays consistent with the policy term when completions are shorter than the max.
-                entropy_loss = (
-                    (entropies * effective_mask).sum() / (entropies.size(0) * self.max_completion_length) / normalizer
-                )
-            elif self.loss_type == "luspo":
-                # luspo weights each sequence by its token count, so entropy is summed per sequence (not
-                # per-token averaged) to stay on the same scale as the policy loss.
-                entropy_loss = (entropies * effective_mask).sum(-1).mean() / normalizer
-            else:  # grpo, sapo, bnpo: normalizer is the gradient accumulation step count
-                # Token-weighted mean entropy of active tokens, matching world_entropy below.
+            else:
+                # Mean per-token entropy of active tokens, scaled for gradient accumulation.
                 entropy_loss = (entropies * effective_mask).sum() / effective_mask.sum().clamp(min=1.0) / normalizer
 
             # Apply the coefficient and gating from the end of the previous optimizer step, so that every

From 8e05132281547ac1222936c68479528176fcaa3f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Jun 2026 16:08:14 +0200
Subject: [PATCH 36/37] Update tests

---
 tests/test_grpo_trainer.py | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
index 8c816bcb33a..875ee9b3b78 100644
--- a/tests/test_grpo_trainer.py
+++ b/tests/test_grpo_trainer.py
@@ -1540,14 +1540,16 @@ def test_train_with_adaptive_entropy(self):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
-    @pytest.mark.parametrize("loss_type", ["grpo", "dapo", "luspo"])
+    @pytest.mark.parametrize("loss_type", ["grpo", "dr_grpo", "dapo", "luspo"])
     def test_entropy_bonus_scale(self, loss_type):
-        # Regression test: the entropy bonus must be normalized like each loss type's policy loss. A previous
-        # "unified" formula divided the per-token mean entropy by the loss normalizer, which for the
-        # cispo/dapo/vespo family is a global token count, making the bonus ~1/sequence_length too small; and
-        # it put luspo's bonus on the per-token scale instead of luspo's sequence-weighted scale. With
-        # gradient_accumulation_steps=1 the per-step entropy contribution to the loss is
-        # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy reveals the scale.
+        # Regression test: the entropy bonus is the mean per-token entropy H for every loss type (documented
+        # objective L = L_policy - entropy_coef * H), so it must not inherit any loss-type-specific policy
+        # normalization. A previous "unified" formula divided H by a global token count for the
+        # cispo/dapo/vespo family, making the bonus ~1/sequence_length too small; conversely, scaling the
+        # bonus like the dr_grpo (fixed budget) or luspo (sequence-weighted) policy term would also be wrong.
+        # With gradient_accumulation_steps=1 the per-step entropy contribution to the loss is
+        # contrib = policy_loss - loss = entropy_coef * entropy_loss, so contrib / entropy must equal
+        # entropy_coef for all loss types.
         entropy_coef = 0.5
         dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
         training_args = GRPOConfig(
@@ -1556,7 +1558,7 @@ def test_entropy_bonus_scale(self, loss_type):
             learning_rate=0.1,  # use higher lr because gradients are tiny and default lr can stall updates
             per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
             num_generations=3,  # reduce the number of generations to reduce memory usage
-            max_completion_length=16,  # long enough that the per-token vs sequence-weighted scales differ
+            max_completion_length=16,  # reduce the completion length to reduce memory usage
             gradient_accumulation_steps=1,  # so contrib == entropy_coef * entropy_loss holds per step
             loss_type=loss_type,
             logging_steps=1,
@@ -1576,15 +1578,8 @@ def test_entropy_bonus_scale(self, loss_type):
         assert logs
         ratios = sorted((h["policy_loss"] - h["loss"]) / h["entropy"] for h in logs)
         ratio = ratios[len(ratios) // 2]  # median, robust to per-step noise
-        if loss_type == "luspo":
-            # luspo weights each sequence by its length, so the bonus is the per-sequence entropy sum: its
-            # scale is entropy_coef * (mean sequence length), well above entropy_coef. The buggy per-token
-            # formula gave ratio == entropy_coef.
-            assert ratio > 1.5 * entropy_coef
-        else:
-            # grpo (and the cispo/dapo/vespo family) regularize the per-token mean entropy, so the bonus is
-            # exactly entropy_coef * entropy. The buggy formula made dapo's ratio smaller by ~1/seq_len.
-            assert ratio == pytest.approx(entropy_coef, rel=0.3)
+        # Every loss type regularizes the mean per-token entropy, so contrib == entropy_coef * entropy.
+        assert ratio == pytest.approx(entropy_coef, rel=0.3)
 
     def test_train_with_adaptive_entropy_gradient_accumulation(self):
         # Adaptive entropy must behave correctly under gradient accumulation: the coefficient and gating are

From bccd8ebbf274edf12ef295c15e0f0e1ae578b306 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 26 Jun 2026 07:23:05 +0200
Subject: [PATCH 37/37] Add clarifying sentence

---
 docs/source/grpo_trainer.md | 2 +-
 trl/trainer/grpo_config.py  | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
index c05cbd389b7..b1e0050e1a6 100644
--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@@ -651,7 +651,7 @@ $$
 \mathcal{L}(\theta) = \mathcal{L}_{\text{GRPO}}(\theta) - \alpha \cdot \mathcal{H}(\pi_\theta),
 $$
 
-where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient.
+where \\(\mathcal{H}(\pi_\theta)\\) is the mean per-token entropy of the policy and \\(\alpha\\) is the entropy coefficient. The bonus is always the mean per-token entropy regardless of `loss_type`; it is not rescaled to match a loss type's policy normalization (e.g. Dr. GRPO's `batch_size * max_completion_length` denominator), so `entropy_coef` has the same meaning for every loss type.
 
 **Static entropy** — a fixed coefficient throughout training:
 
diff --git a/trl/trainer/grpo_config.py b/trl/trainer/grpo_config.py
index 5c469aecfdc..46fce0e13a9 100644
--- a/trl/trainer/grpo_config.py
+++ b/trl/trainer/grpo_config.py
@@ -297,7 +297,9 @@ class GRPOConfig(_BaseConfig):
             `mask_truncated_completions=True`, only tokens from non-truncated completions are considered.
         entropy_coef (`float`, *optional*, defaults to `0.0`):
             Coefficient of the entropy regularization term in the loss. A positive value adds an entropy bonus that
-            encourages exploration by keeping the policy from collapsing to near-deterministic outputs. When
+            encourages exploration by keeping the policy from collapsing to near-deterministic outputs. The bonus is
+            always the mean per-token entropy regardless of `loss_type`; it is not rescaled to match a loss type's
+            policy normalization, so `entropy_coef` has the same meaning for every loss type. When
             `use_adaptive_entropy=True`, this serves as the initial coefficient and is updated each optimizer step.
             Has no effect when set to `0.0` (default).
         use_adaptive_entropy (`bool`, *optional*, defaults to `False`):
@@ -862,7 +864,9 @@ class GRPOConfig(_BaseConfig):
         default=0.0,
         metadata={
             "help": "Coefficient of the entropy regularization term in the loss. A positive value adds an entropy "
-            "bonus that encourages exploration. When `use_adaptive_entropy=True`, this serves as the initial "
+            "bonus that encourages exploration. The bonus is always the mean per-token entropy regardless of "
+            "`loss_type` (not rescaled to a loss type's policy normalization), so `entropy_coef` has the same "
+            "meaning for every loss type. When `use_adaptive_entropy=True`, this serves as the initial "
             "coefficient and is updated each optimizer step. Has no effect when set to `0.0` (default)."
         },
     )