From 45d6a2decd6b170775f0fb762b110aa5f8adfab0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Wed, 24 Jun 2026 00:12:34 +0000
Subject: [PATCH 1/5] Add `quantization_config` trainer argument (streamline
 QLoRA)

---
 docs/source/peft_integration.md | 16 ++++++----------
 trl/scripts/dpo.py              |  8 ++------
 trl/scripts/grpo.py             |  8 ++------
 trl/scripts/reward.py           |  8 ++------
 trl/scripts/rloo.py             |  8 ++------
 trl/scripts/sft.py              |  8 ++------
 trl/trainer/dpo_trainer.py      | 17 +++++++++++++++++
 trl/trainer/grpo_trainer.py     | 17 +++++++++++++++++
 trl/trainer/reward_trainer.py   | 18 ++++++++++++++++++
 trl/trainer/rloo_trainer.py     | 17 +++++++++++++++++
 trl/trainer/sft_trainer.py      | 17 +++++++++++++++++
 11 files changed, 102 insertions(+), 40 deletions(-)

diff --git a/docs/source/peft_integration.md b/docs/source/peft_integration.md
index cfc5c180f9b..5eb660e99b8 100644
--- a/docs/source/peft_integration.md
+++ b/docs/source/peft_integration.md
@@ -449,11 +449,13 @@ python trl/scripts/sft.py \
 
 #### Python Example
 
+Pass the `quantization_config` directly to the trainer alongside `peft_config` — the trainer loads and quantizes the model for you. The same `quantization_config` argument is available on [`SFTTrainer`], [`DPOTrainer`], [`RewardTrainer`], [`GRPOTrainer`], and [`RLOOTrainer`].
+
 ```python
 import torch
 
 from peft import LoraConfig
-from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+from transformers import BitsAndBytesConfig
 from trl import SFTConfig, SFTTrainer
 
 # Configure 4-bit quantization
@@ -464,13 +466,6 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-# Load model with quantization
-model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-2-7b-hf",
-    quantization_config=bnb_config,
-    device_map="auto",
-)
-
 # Configure LoRA
 peft_config = LoraConfig(
     r=32,
@@ -486,11 +481,12 @@ training_args = SFTConfig(
     ...
 )
 
-# Create trainer with PEFT config
+# Create trainer with quantization and PEFT config
 trainer = SFTTrainer(
-    model=model,
+    model="meta-llama/Llama-2-7b-hf",
     args=training_args,
     train_dataset=dataset,
+    quantization_config=bnb_config,
     peft_config=peft_config,
 )
 
diff --git a/trl/scripts/dpo.py b/trl/scripts/dpo.py
index 5303906dcef..a73391716af 100644
--- a/trl/scripts/dpo.py
+++ b/trl/scripts/dpo.py
@@ -65,7 +65,7 @@ def main(script_args, training_args, model_args, dataset_args):
     from accelerate.logging import get_logger
     from datasets import load_dataset
 
-    from trl import DPOTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config
+    from trl import DPOTrainer, get_dataset, get_peft_config, get_quantization_config
 
     logger = get_logger(__name__)
 
@@ -75,11 +75,6 @@ def main(script_args, training_args, model_args, dataset_args):
         attn_implementation=model_args.attn_implementation,
         dtype=model_args.dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     # Load the dataset
     if dataset_args.datasets and script_args.dataset_name:
@@ -103,6 +98,7 @@ def main(script_args, training_args, model_args, dataset_args):
         args=training_args,
         train_dataset=dataset[script_args.dataset_train_split],
         eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/trl/scripts/grpo.py b/trl/scripts/grpo.py
index fde837780c5..62d2aef142c 100644
--- a/trl/scripts/grpo.py
+++ b/trl/scripts/grpo.py
@@ -69,7 +69,7 @@ def main(script_args, training_args, model_args, dataset_args):
     from accelerate.logging import get_logger
     from datasets import load_dataset
 
-    from trl import GRPOTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config
+    from trl import GRPOTrainer, get_dataset, get_peft_config, get_quantization_config
     from trl.rewards import (
         accuracy_reward,
         get_soft_overlong_punishment,
@@ -113,11 +113,6 @@ def main(script_args, training_args, model_args, dataset_args):
         attn_implementation=model_args.attn_implementation,
         dtype=model_args.dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     # Load the dataset
     if dataset_args.datasets and script_args.dataset_name:
@@ -142,6 +137,7 @@ def main(script_args, training_args, model_args, dataset_args):
         args=training_args,
         train_dataset=dataset[script_args.dataset_train_split],
         eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/trl/scripts/reward.py b/trl/scripts/reward.py
index 9cec8e9de84..6ee1dff063f 100644
--- a/trl/scripts/reward.py
+++ b/trl/scripts/reward.py
@@ -28,7 +28,7 @@ def main(script_args, training_args, model_args, dataset_args):
     from accelerate.logging import get_logger
     from datasets import load_dataset
 
-    from trl import RewardTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config
+    from trl import RewardTrainer, get_dataset, get_peft_config, get_quantization_config
 
     logger = get_logger(__name__)
 
@@ -38,11 +38,6 @@ def main(script_args, training_args, model_args, dataset_args):
         attn_implementation=model_args.attn_implementation,
         dtype=model_args.dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     # Load the dataset
     if dataset_args.datasets and script_args.dataset_name:
@@ -66,6 +61,7 @@ def main(script_args, training_args, model_args, dataset_args):
         args=training_args,
         train_dataset=dataset[script_args.dataset_train_split],
         eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/trl/scripts/rloo.py b/trl/scripts/rloo.py
index 7ec6f380695..520872e3a76 100644
--- a/trl/scripts/rloo.py
+++ b/trl/scripts/rloo.py
@@ -69,7 +69,7 @@ def main(script_args, training_args, model_args, dataset_args):
     from accelerate.logging import get_logger
     from datasets import load_dataset
 
-    from trl import RLOOTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config
+    from trl import RLOOTrainer, get_dataset, get_peft_config, get_quantization_config
     from trl.rewards import (
         accuracy_reward,
         get_soft_overlong_punishment,
@@ -113,11 +113,6 @@ def main(script_args, training_args, model_args, dataset_args):
         attn_implementation=model_args.attn_implementation,
         dtype=model_args.dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     # Load the dataset
     if dataset_args.datasets and script_args.dataset_name:
@@ -142,6 +137,7 @@ def main(script_args, training_args, model_args, dataset_args):
         args=training_args,
         train_dataset=dataset[script_args.dataset_train_split],
         eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/trl/scripts/sft.py b/trl/scripts/sft.py
index 82c9622cb0d..760b9683bd6 100644
--- a/trl/scripts/sft.py
+++ b/trl/scripts/sft.py
@@ -67,7 +67,7 @@ def main(script_args, training_args, model_args, dataset_args):
     from accelerate.logging import get_logger
     from datasets import load_dataset
 
-    from trl import SFTTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config
+    from trl import SFTTrainer, get_dataset, get_peft_config, get_quantization_config
 
     logger = get_logger(__name__)
 
@@ -77,11 +77,6 @@ def main(script_args, training_args, model_args, dataset_args):
         attn_implementation=model_args.attn_implementation,
         dtype=model_args.dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     # Load the dataset
     if dataset_args.datasets and script_args.dataset_name:
@@ -105,6 +100,7 @@ def main(script_args, training_args, model_args, dataset_args):
         args=training_args,
         train_dataset=dataset[script_args.dataset_train_split],
         eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 2d58d09cc17..58f3ff1f03c 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -34,6 +34,7 @@
 from torch.utils.data import DataLoader
 from transformers import (
     AutoProcessor,
+    BitsAndBytesConfig,
     DataCollator,
     PreTrainedModel,
     PreTrainedTokenizerBase,
@@ -478,6 +479,9 @@ class DPOTrainer(_BaseTrainer):
         optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`):
             A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
             model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
+        quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*):
+            Quantization configuration used when loading the model from a model identifier. Combine with `peft_config`
+            for QLoRA training. Ignored if the model is already instantiated.
         peft_config ([`~peft.PeftConfig`], *optional*):
             PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
     """
@@ -511,6 +515,7 @@ def __init__(
         compute_metrics: Callable[[EvalPrediction], dict] | None = None,
         callbacks: list[TrainerCallback] | None = None,
         optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
+        quantization_config: "BitsAndBytesConfig | None" = None,
         peft_config: "PeftConfig | None" = None,
     ):
         # Args
@@ -535,6 +540,13 @@ def __init__(
         # Model
         if isinstance(model, str):
             model_init_kwargs = args.model_init_kwargs or {}
+            if quantization_config is not None:
+                if "quantization_config" in model_init_kwargs:
+                    raise ValueError(
+                        "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
+                        "Please set it in only one place."
+                    )
+                model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
             if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
@@ -546,6 +558,11 @@ def __init__(
                     "You passed `model_init_kwargs` to the `DPOConfig`, but your model is already instantiated. "
                     "The `model_init_kwargs` will be ignored."
                 )
+            if quantization_config is not None:
+                logger.warning(
+                    "You passed `quantization_config` to the trainer, but your model is already instantiated. The "
+                    "`quantization_config` will be ignored."
+                )
         # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do
         _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False)
         if ref_model is model:
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 2bdc363a945..ded3e2e0f86 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -45,6 +45,7 @@
     AutoModelForSequenceClassification,
     AutoProcessor,
     AutoTokenizer,
+    BitsAndBytesConfig,
     GenerationConfig,
     PreTrainedModel,
     PreTrainedTokenizerBase,
@@ -226,6 +227,9 @@ class GRPOTrainer(_BaseTrainer):
         optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`):
             A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
             model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
+        quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*):
+            Quantization configuration used when loading the model from a model identifier. Combine with `peft_config`
+            for QLoRA training. Ignored if the model is already instantiated.
         peft_config ([`~peft.PeftConfig`], *optional*):
             PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
         tools (list of `Callable`, *optional*):
@@ -280,6 +284,7 @@ def __init__(
         reward_processing_classes: PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None = None,
         callbacks: list[TrainerCallback] | None = None,
         optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
+        quantization_config: "BitsAndBytesConfig | None" = None,
         peft_config: "PeftConfig | None" = None,
         tools: list[Callable] | None = None,
         rollout_func: RolloutFunc | None = None,
@@ -294,6 +299,13 @@ def __init__(
         # Model
         if isinstance(model, str):
             model_init_kwargs = args.model_init_kwargs or {}
+            if quantization_config is not None:
+                if "quantization_config" in model_init_kwargs:
+                    raise ValueError(
+                        "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
+                        "Please set it in only one place."
+                    )
+                model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
             if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
@@ -305,6 +317,11 @@ def __init__(
                     "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
                     "The `model_init_kwargs` will be ignored."
                 )
+            if quantization_config is not None:
+                logger.warning(
+                    "You passed `quantization_config` to the trainer, but your model is already instantiated. The "
+                    "`quantization_config` will be ignored."
+                )
         # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do
         _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False)
 
diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py
index cfa6254f97c..fba474b1f79 100644
--- a/trl/trainer/reward_trainer.py
+++ b/trl/trainer/reward_trainer.py
@@ -36,6 +36,7 @@
 from transformers import (
     AutoModelForSequenceClassification,
     AutoTokenizer,
+    BitsAndBytesConfig,
     DataCollator,
     PreTrainedModel,
     PreTrainedTokenizerBase,
@@ -309,6 +310,10 @@ class RewardTrainer(_BaseTrainer):
             by this function will be reflected in the predictions received by `compute_metrics`.
 
             Note that the labels (second parameter) will be `None` if the dataset does not have them.
+        quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*):
+            Quantization configuration used when loading the model from a model identifier. Combine with `peft_config`
+            for QLoRA training. Ignored if the model is already instantiated, or if `quantization_config` is also set in
+            `args.model_init_kwargs`.
         peft_config ([`~peft.PeftConfig`], *optional*):
             PEFT configuration used to wrap the model. If `None`, the model is not wrapped. Note that if the loaded
             model is a causal LM, it's highly recommended to set `modules_to_save=["score"]` in the PEFT configuration
@@ -332,6 +337,7 @@ def __init__(
         optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
         optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None,
         preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+        quantization_config: "BitsAndBytesConfig | None" = None,
         peft_config: "PeftConfig | None" = None,
     ):
         # Args
@@ -359,6 +365,13 @@ def __init__(
         set_seed(args.seed)
         if isinstance(model, str):
             model_init_kwargs = args.model_init_kwargs or {}
+            if quantization_config is not None:
+                if "quantization_config" in model_init_kwargs:
+                    raise ValueError(
+                        "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
+                        "Please set it in only one place."
+                    )
+                model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
             if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
@@ -372,6 +385,11 @@ def __init__(
                     "You passed `model_init_kwargs` to the `RewardConfig`, but your model is already instantiated. "
                     "The `model_init_kwargs` will be ignored."
                 )
+            if quantization_config is not None:
+                logger.warning(
+                    "You passed `quantization_config` to the trainer, but your model is already instantiated. The "
+                    "`quantization_config` will be ignored."
+                )
             # Validate that the model has num_labels = 1 (required for reward models)
             if getattr(model.config, "num_labels", None) != 1:
                 raise ValueError(
diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py
index 9996d5a7eb0..8e301e93ebd 100644
--- a/trl/trainer/rloo_trainer.py
+++ b/trl/trainer/rloo_trainer.py
@@ -39,6 +39,7 @@
     AutoModelForSequenceClassification,
     AutoProcessor,
     AutoTokenizer,
+    BitsAndBytesConfig,
     GenerationConfig,
     PreTrainedModel,
     PreTrainedTokenizerBase,
@@ -196,6 +197,9 @@ class RLOOTrainer(_BaseTrainer):
         optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`):
             A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
             model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
+        quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*):
+            Quantization configuration used when loading the model from a model identifier. Combine with `peft_config`
+            for QLoRA training. Ignored if the model is already instantiated.
         peft_config ([`~peft.PeftConfig`], *optional*):
             PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
     """
@@ -229,6 +233,7 @@ def __init__(
         reward_processing_classes: PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None = None,
         callbacks: list[TrainerCallback] | None = None,
         optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
+        quantization_config: "BitsAndBytesConfig | None" = None,
         peft_config: "PeftConfig | None" = None,
     ):
         # Args
@@ -240,6 +245,13 @@ def __init__(
         # Model
         if isinstance(model, str):
             model_init_kwargs = args.model_init_kwargs or {}
+            if quantization_config is not None:
+                if "quantization_config" in model_init_kwargs:
+                    raise ValueError(
+                        "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
+                        "Please set it in only one place."
+                    )
+                model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
             if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
@@ -251,6 +263,11 @@ def __init__(
                     "You passed `model_init_kwargs` to the `RLOOConfig`, but your model is already instantiated. "
                     "The `model_init_kwargs` will be ignored."
                 )
+            if quantization_config is not None:
+                logger.warning(
+                    "You passed `quantization_config` to the trainer, but your model is already instantiated. The "
+                    "`quantization_config` will be ignored."
+                )
         # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do
         _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False)
 
diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py
index e6dc55a04dd..7dff94421b4 100644
--- a/trl/trainer/sft_trainer.py
+++ b/trl/trainer/sft_trainer.py
@@ -34,6 +34,7 @@
 from packaging.version import Version
 from transformers import (
     AutoProcessor,
+    BitsAndBytesConfig,
     DataCollator,
     PreTrainedModel,
     PreTrainedTokenizerBase,
@@ -901,6 +902,9 @@ class SFTTrainer(_BaseTrainer):
             by this function will be reflected in the predictions received by `compute_metrics`.
 
             Note that the labels (second parameter) will be `None` if the dataset does not have them.
+        quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*):
+            Quantization configuration used when loading the model from a model identifier. Combine with `peft_config`
+            for QLoRA training. Ignored if the model is already instantiated.
         peft_config ([`~peft.PeftConfig`], *optional*):
             PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
         formatting_func (`Callable`, *optional*):
@@ -925,6 +929,7 @@ def __init__(
         optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None),
         optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None,
         preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+        quantization_config: "BitsAndBytesConfig | None" = None,
         peft_config: "PeftConfig | None" = None,
         formatting_func: Callable[[dict], str] | None = None,
     ):
@@ -956,6 +961,13 @@ def __init__(
         # Model
         if isinstance(model, str):
             model_init_kwargs = args.model_init_kwargs or {}
+            if quantization_config is not None:
+                if "quantization_config" in model_init_kwargs:
+                    raise ValueError(
+                        "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
+                        "Please set it in only one place."
+                    )
+                model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
             if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
@@ -967,6 +979,11 @@ def __init__(
                     "You passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. "
                     "The `model_init_kwargs` will be ignored."
                 )
+            if quantization_config is not None:
+                logger.warning(
+                    "You passed `quantization_config` to the trainer, but your model is already instantiated. The "
+                    "`quantization_config` will be ignored."
+                )
         # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do
         _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False)
 

From 38626f35233b80f7c4c8b1f364217583b3c854fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Wed, 24 Jun 2026 03:38:58 +0000
Subject: [PATCH 2/5] style

---
 trl/trainer/reward_trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py
index fba474b1f79..063843a5689 100644
--- a/trl/trainer/reward_trainer.py
+++ b/trl/trainer/reward_trainer.py
@@ -312,8 +312,8 @@ class RewardTrainer(_BaseTrainer):
             Note that the labels (second parameter) will be `None` if the dataset does not have them.
         quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*):
             Quantization configuration used when loading the model from a model identifier. Combine with `peft_config`
-            for QLoRA training. Ignored if the model is already instantiated, or if `quantization_config` is also set in
-            `args.model_init_kwargs`.
+            for QLoRA training. Ignored if the model is already instantiated. Raises a `ValueError` if
+            `quantization_config` is also set in `args.model_init_kwargs`.
         peft_config ([`~peft.PeftConfig`], *optional*):
             PEFT configuration used to wrap the model. If `None`, the model is not wrapped. Note that if the loaded
             model is a causal LM, it's highly recommended to set `modules_to_save=["score"]` in the PEFT configuration

From 0bb426cb4db22d562277e85adcd1f279eaf31146 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Wed, 24 Jun 2026 03:42:46 +0000
Subject: [PATCH 3/5] Clarify error message for `quantization_config` to prefer
 trainer argument

---
 trl/trainer/dpo_trainer.py    | 2 +-
 trl/trainer/grpo_trainer.py   | 2 +-
 trl/trainer/reward_trainer.py | 2 +-
 trl/trainer/rloo_trainer.py   | 2 +-
 trl/trainer/sft_trainer.py    | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 58f3ff1f03c..500ad964c84 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -544,7 +544,7 @@ def __init__(
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
                         "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
-                        "Please set it in only one place."
+                        "Please set it in only one place, preferably as a trainer argument."
                     )
                 model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index ded3e2e0f86..28727fb28da 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -303,7 +303,7 @@ def __init__(
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
                         "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
-                        "Please set it in only one place."
+                        "Please set it in only one place, preferably as a trainer argument."
                     )
                 model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py
index 063843a5689..9f2c3c667b4 100644
--- a/trl/trainer/reward_trainer.py
+++ b/trl/trainer/reward_trainer.py
@@ -369,7 +369,7 @@ def __init__(
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
                         "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
-                        "Please set it in only one place."
+                        "Please set it in only one place, preferably as a trainer argument."
                     )
                 model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py
index 8e301e93ebd..40171594d43 100644
--- a/trl/trainer/rloo_trainer.py
+++ b/trl/trainer/rloo_trainer.py
@@ -249,7 +249,7 @@ def __init__(
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
                         "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
-                        "Please set it in only one place."
+                        "Please set it in only one place, preferably as a trainer argument."
                     )
                 model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py
index 7dff94421b4..27e8cde8b47 100644
--- a/trl/trainer/sft_trainer.py
+++ b/trl/trainer/sft_trainer.py
@@ -965,7 +965,7 @@ def __init__(
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
                         "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. "
-                        "Please set it in only one place."
+                        "Please set it in only one place, preferably as a trainer argument."
                     )
                 model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)

From f6a660bbadc03b658450df0d0922145af65ecd6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Fri, 26 Jun 2026 15:39:33 +0000
Subject: [PATCH 4/5] fix quantization configuration handling in trainers and
 scripts

---
 examples/scripts/grpo_vlm.py  | 7 +------
 examples/scripts/gspo.py      | 7 +------
 examples/scripts/gspo_vlm.py  | 7 +------
 examples/scripts/rloo_vlm.py  | 7 +------
 trl/trainer/dpo_trainer.py    | 6 ++++--
 trl/trainer/grpo_trainer.py   | 6 ++++--
 trl/trainer/reward_trainer.py | 2 +-
 trl/trainer/rloo_trainer.py   | 6 ++++--
 trl/trainer/sft_trainer.py    | 2 +-
 9 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/examples/scripts/grpo_vlm.py b/examples/scripts/grpo_vlm.py
index c748b1b15fc..8e95902dae6 100644
--- a/examples/scripts/grpo_vlm.py
+++ b/examples/scripts/grpo_vlm.py
@@ -71,7 +71,6 @@
     ModelConfig,
     ScriptArguments,
     TrlParser,
-    get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
 )
@@ -90,11 +89,6 @@
         attn_implementation=model_args.attn_implementation,
         dtype=dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     ################
     # Dataset
@@ -146,6 +140,7 @@ def convert_to_rgb(example):
         reward_funcs=[think_format_reward, accuracy_reward],
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/examples/scripts/gspo.py b/examples/scripts/gspo.py
index 9f347fd6e3a..a1fb7b67aaf 100644
--- a/examples/scripts/gspo.py
+++ b/examples/scripts/gspo.py
@@ -60,7 +60,6 @@
     ModelConfig,
     ScriptArguments,
     TrlParser,
-    get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
 )
@@ -79,11 +78,6 @@
         attn_implementation=model_args.attn_implementation,
         dtype=dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     ################
     # Dataset
@@ -120,6 +114,7 @@ def make_conversation(example):
         reward_funcs=[think_format_reward, accuracy_reward],
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/examples/scripts/gspo_vlm.py b/examples/scripts/gspo_vlm.py
index f96c68a1819..d537c812b70 100644
--- a/examples/scripts/gspo_vlm.py
+++ b/examples/scripts/gspo_vlm.py
@@ -60,7 +60,6 @@
     ModelConfig,
     ScriptArguments,
     TrlParser,
-    get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
 )
@@ -79,11 +78,6 @@
         attn_implementation=model_args.attn_implementation,
         dtype=dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     ################
     # Dataset
@@ -135,6 +129,7 @@ def convert_to_rgb(example):
         reward_funcs=[think_format_reward, accuracy_reward],
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/examples/scripts/rloo_vlm.py b/examples/scripts/rloo_vlm.py
index 87bcdc59752..0f77b89210c 100644
--- a/examples/scripts/rloo_vlm.py
+++ b/examples/scripts/rloo_vlm.py
@@ -71,7 +71,6 @@
     RLOOTrainer,
     ScriptArguments,
     TrlParser,
-    get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
 )
@@ -90,11 +89,6 @@
         attn_implementation=model_args.attn_implementation,
         dtype=dtype,
     )
-    quantization_config = get_quantization_config(model_args)
-    if quantization_config is not None:
-        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
-        training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
-        training_args.model_init_kwargs["quantization_config"] = quantization_config
 
     ################
     # Dataset
@@ -146,6 +140,7 @@ def convert_to_rgb(example):
         reward_funcs=[think_format_reward, accuracy_reward],
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
+        quantization_config=get_quantization_config(model_args),
         peft_config=get_peft_config(model_args),
     )
 
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 333787c5a6c..6aa6b83dd48 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -539,7 +539,7 @@ def __init__(
 
         # Model
         if isinstance(model, str):
-            model_init_kwargs = args.model_init_kwargs or {}
+            model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
             if quantization_config is not None:
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
@@ -821,7 +821,9 @@ def __init__(
                 # memory during training.
                 self.ref_model = None
             else:
-                ref_model_init_kwargs = args.model_init_kwargs or {}
+                ref_model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
+                if quantization_config is not None:
+                    ref_model_init_kwargs["quantization_config"] = quantization_config
                 # Distributed training requires device_map=None ("auto" fails)
                 if self.args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                     ref_model_init_kwargs["device_map"] = None
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 680df67822d..0050da79e67 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -298,7 +298,7 @@ def __init__(
 
         # Model
         if isinstance(model, str):
-            model_init_kwargs = args.model_init_kwargs or {}
+            model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
             if quantization_config is not None:
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
@@ -775,7 +775,9 @@ def __init__(
             self.ref_model = None
         else:
             # For deepspeed, fsdp or non-distributed models, create a reference model from scratch
-            model_init_kwargs = args.model_init_kwargs or {}
+            model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
+            if quantization_config is not None:
+                model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
             if self.args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py
index 215323b3270..49ff7c0cd43 100644
--- a/trl/trainer/reward_trainer.py
+++ b/trl/trainer/reward_trainer.py
@@ -364,7 +364,7 @@ def __init__(
         # be done before loading the model to ensure reproducibility.
         set_seed(args.seed)
         if isinstance(model, str):
-            model_init_kwargs = args.model_init_kwargs or {}
+            model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
             if quantization_config is not None:
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py
index 894406fb8da..2b1142dd16b 100644
--- a/trl/trainer/rloo_trainer.py
+++ b/trl/trainer/rloo_trainer.py
@@ -244,7 +244,7 @@ def __init__(
 
         # Model
         if isinstance(model, str):
-            model_init_kwargs = args.model_init_kwargs or {}
+            model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
             if quantization_config is not None:
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(
@@ -547,7 +547,9 @@ def __init__(
             self.ref_model = None
         else:
             # For deepspeed, fsdp or non-distributed models, create a reference model from scratch
-            model_init_kwargs = args.model_init_kwargs or {}
+            model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
+            if quantization_config is not None:
+                model_init_kwargs["quantization_config"] = quantization_config
             # Distributed training requires device_map=None ("auto" fails)
             if self.args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                 model_init_kwargs["device_map"] = None
diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py
index 40b1e4b2b0c..6197d626d66 100644
--- a/trl/trainer/sft_trainer.py
+++ b/trl/trainer/sft_trainer.py
@@ -960,7 +960,7 @@ def __init__(
 
         # Model
         if isinstance(model, str):
-            model_init_kwargs = args.model_init_kwargs or {}
+            model_init_kwargs = dict(args.model_init_kwargs or {})  # copy to avoid mutating model_init_kwargs
             if quantization_config is not None:
                 if "quantization_config" in model_init_kwargs:
                     raise ValueError(

From 7be97c47a0a350ac79ce8166bdbf845fd48c9494 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <gallouedec.quentin@gmail.com>
Date: Fri, 26 Jun 2026 15:48:39 +0000
Subject: [PATCH 5/5] update notebooks

---
 examples/notebooks/grpo_trl_lora_qlora.ipynb | 3044 +++++++++---------
 examples/notebooks/sft_trl_lora_qlora.ipynb  |   57 +-
 2 files changed, 1546 insertions(+), 1555 deletions(-)

diff --git a/examples/notebooks/grpo_trl_lora_qlora.ipynb b/examples/notebooks/grpo_trl_lora_qlora.ipynb
index 80375e2b4e3..fdff2cc44f1 100644
--- a/examples/notebooks/grpo_trl_lora_qlora.ipynb
+++ b/examples/notebooks/grpo_trl_lora_qlora.ipynb
@@ -1,1638 +1,1626 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "27ozP4Uy-Cz2"
-      },
-      "source": [
-        "# Group Relative Policy Optimization (GRPO) with LoRA/QLoRA using TRL — on a Free Colab Notebook\n",
-        "\n",
-        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_trl_lora_qlora.ipynb)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "eOjY4AR1-QnF"
-      },
-      "source": [
-        "![trl banner](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png)\n",
-        "\n",
-        "Easily fine-tune **Large Language Models (LLMs)** or **Vision-Language Models (VLMs)** with **LoRA** or **QLoRA** using the [**Transformers Reinforcement Learning (TRL)**](https://github.com/huggingface/trl) library by Hugging Face and Group Relative Policy Optimization (GRPO) — all within a **free Google Colab notebook** powered by a **T4 GPU**.\n",
-        "\n",
-        "Thanks to the **built-in memory and training optimizations in TRL**, including LoRA, quantization, gradient checkpointing, and optimized attention kernels, it is possible to **fine-tune a 7B model on a free T4** with a **~7× reduction in memory consumption** compared to naive FP16 training.\n",
-        "\n",
-        "- [TRL GitHub Repository](https://github.com/huggingface/trl) — star us to support the project!  \n",
-        "- [Official TRL Examples](https://huggingface.co/docs/trl/example_overview)  \n",
-        "- [Community Tutorials](https://huggingface.co/docs/trl/community_tutorials)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "w2TnJ6ta-2zj"
-      },
-      "source": [
-        "## Key concepts\n",
-        "\n",
-        "- **GRPO**: A reinforcement learning algorithm that optimizes a policy by comparing multiple generated responses for the same prompt and updating the model based on their relative rewards, without requiring a separate value model.\n",
-        "- **LoRA**: Updates only a few low-rank parameters, reducing training cost and memory.\n",
-        "- **QLoRA**: A quantized version of LoRA that enables even larger models to fit on small GPUs.\n",
-        "- **TRL**: The Hugging Face library that makes fine-tuning and reinforcement learning simple and efficient.\n",
-        "\n",
-        "Learn how to perform **GRPO (Group Relative Policy Optimization)** with **LoRA/QLoRA** using **TRL**."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "EzScUBxoT4Nt"
-      },
-      "source": [
-        "This table demonstrates how **progressively enabling efficiency techniques** affects **memory usage** and **training throughput** across different hardware configurations.  \n",
-        "The techniques range from naive FP16 training to **LoRA, quantization, Liger kernels, paged_adamw_8bit, and gradient checkpointing**.\n",
-        "\n",
-        "| Configuration | LoRA | Quant | Liger | Optimizer | Grad. Ckpt | attn_impl  | VRAM (T4) GB | VRAM (A100-40GB)| VRAM (A100-80GB) | Tokens/s (T4) | Tokens/s (A100-40GB) | Tokens/s (A100-80GB) | Status (T4) |\n",
-        "|--------------|------|-------|-------|-----------|------------|-----------|---------------|----------------|---------|---------|---------------|------------------|-------------|\n",
-        "| **Worst (naive FP16)** | ❌ | ❌ | ❌ | AdamW | ❌  | eager | OOM | OOM | 62 GB | - | - | 0.06 it/s | ❌ |\n",
-        "| **Best (all optimizations)** | ✅ | ✅ | ✅ | paged_adamw_8bit | ✅ | sdpa  | 9.2 GB | 9.6 GB | 9.6 GB | 0.01 it/s | 0.03 it/s | 0.04 it/s | ✅ |\n",
-        "\n",
-        "With all efficiency techniques enabled, **memory usage on Colab T4 is reduced by ~7×**, making it possible to **fine-tune a 7B model on free Colab** where naive FP16 training would fail.\n",
-        "\n",
-        "> A small trade-off in training speed is observed, but the **VRAM reduction is the key enabler**. For faster training on compatible hardware, **vLLM** can also be leveraged.\n",
-        "\n",
-        "> 💡 Note: For a fair comparison, the number of generations and the batch size were not changed."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "9RFq6Op7rjc3"
-      },
-      "source": [
-        "## Install dependencies\n",
-        "\n",
-        "We'll install **TRL** with the **PEFT** extra, which ensures all main dependencies such as **Transformers** and **PEFT** (a package for parameter-efficient fine-tuning, e.g., LoRA/QLoRA) are included. Additionally, we'll install **trackio** to log and monitor our experiments, **bitsandbytes** to enable quantization of LLMs, reducing memory consumption for both inference and training, and **liger-kernel** for more efficient training."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "c2jy45nfWbdo"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -Uq \"trl[peft]\" bitsandbytes trackio math_verify liger-kernel"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "B33zJG_Q_qb3"
-      },
-      "source": [
-        "### Log in to Hugging Face\n",
-        "\n",
-        "Log in to your **Hugging Face** account to save your fine-tuned model, track your experiment results directly on the Hub or access gated models. You can find your **access token** on your [account settings page](https://huggingface.co/settings/tokens)."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "eec717d21e734c4da066763b4a6add7e"
-          ]
-        },
-        "id": "8zqnTyUDWbdo",
-        "outputId": "62d71aaf-352b-4736-acb9-189d78654718"
-      },
-      "outputs": [],
-      "source": [
-        "from huggingface_hub import notebook_login\n",
-        "\n",
-        "notebook_login()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "cTEw4xlFrhnQ"
-      },
-      "source": [
-        "## Load Dataset\n",
-        "\n",
-        "In this step, we load the [**AI-MO/NuminaMath-TIR**](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset from the Hugging Face Hub using the `datasets` library.\n",
-        "This dataset focuses on **mathematical reasoning**, featuring problems that require step-by-step logical solutions.\n",
-        "By fine-tuning a model that does not yet exhibit strong reasoning capabilities, it can learn to **generate structured reasoning steps**, enhancing both the model's **accuracy** and **interpretability** on math-related tasks.\n",
-        "\n",
-        "For efficiency, we'll load only a **small portion of the training split**:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "zU5icx67Wbdp",
-        "outputId": "6480b287-dc0e-4e79-feda-f5e4f41d2a82"
-      },
-      "outputs": [],
-      "source": [
-        "from datasets import load_dataset\n",
-        "\n",
-        "dataset_name = 'AI-MO/NuminaMath-TIR'\n",
-        "train_dataset = load_dataset(dataset_name, split='train[:5%]')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "P1AIokQrBEGw"
-      },
-      "source": [
-        "Let's check the structure of the dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "ff6Gx1TWWbdp",
-        "outputId": "30d49bed-273a-47d9-d131-a677ca5a8b65"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Dataset({\n",
-            "    features: ['problem', 'solution', 'messages'],\n",
-            "    num_rows: 3622\n",
-            "})\n"
-          ]
-        }
-      ],
-      "source": [
-        "print(train_dataset)"
-      ]
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "27ozP4Uy-Cz2"
+   },
+   "source": [
+    "# Group Relative Policy Optimization (GRPO) with LoRA/QLoRA using TRL — on a Free Colab Notebook\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_trl_lora_qlora.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "eOjY4AR1-QnF"
+   },
+   "source": [
+    "![trl banner](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png)\n",
+    "\n",
+    "Easily fine-tune **Large Language Models (LLMs)** or **Vision-Language Models (VLMs)** with **LoRA** or **QLoRA** using the [**Transformers Reinforcement Learning (TRL)**](https://github.com/huggingface/trl) library by Hugging Face and Group Relative Policy Optimization (GRPO) — all within a **free Google Colab notebook** powered by a **T4 GPU**.\n",
+    "\n",
+    "Thanks to the **built-in memory and training optimizations in TRL**, including LoRA, quantization, gradient checkpointing, and optimized attention kernels, it is possible to **fine-tune a 7B model on a free T4** with a **~7× reduction in memory consumption** compared to naive FP16 training.\n",
+    "\n",
+    "- [TRL GitHub Repository](https://github.com/huggingface/trl) — star us to support the project!  \n",
+    "- [Official TRL Examples](https://huggingface.co/docs/trl/example_overview)  \n",
+    "- [Community Tutorials](https://huggingface.co/docs/trl/community_tutorials)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "w2TnJ6ta-2zj"
+   },
+   "source": [
+    "## Key concepts\n",
+    "\n",
+    "- **GRPO**: A reinforcement learning algorithm that optimizes a policy by comparing multiple generated responses for the same prompt and updating the model based on their relative rewards, without requiring a separate value model.\n",
+    "- **LoRA**: Updates only a few low-rank parameters, reducing training cost and memory.\n",
+    "- **QLoRA**: A quantized version of LoRA that enables even larger models to fit on small GPUs.\n",
+    "- **TRL**: The Hugging Face library that makes fine-tuning and reinforcement learning simple and efficient.\n",
+    "\n",
+    "Learn how to perform **GRPO (Group Relative Policy Optimization)** with **LoRA/QLoRA** using **TRL**."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "EzScUBxoT4Nt"
+   },
+   "source": [
+    "This table demonstrates how **progressively enabling efficiency techniques** affects **memory usage** and **training throughput** across different hardware configurations.  \n",
+    "The techniques range from naive FP16 training to **LoRA, quantization, Liger kernels, paged_adamw_8bit, and gradient checkpointing**.\n",
+    "\n",
+    "| Configuration | LoRA | Quant | Liger | Optimizer | Grad. Ckpt | attn_impl  | VRAM (T4) GB | VRAM (A100-40GB)| VRAM (A100-80GB) | Tokens/s (T4) | Tokens/s (A100-40GB) | Tokens/s (A100-80GB) | Status (T4) |\n",
+    "|--------------|------|-------|-------|-----------|------------|-----------|---------------|----------------|---------|---------|---------------|------------------|-------------|\n",
+    "| **Worst (naive FP16)** | ❌ | ❌ | ❌ | AdamW | ❌  | eager | OOM | OOM | 62 GB | - | - | 0.06 it/s | ❌ |\n",
+    "| **Best (all optimizations)** | ✅ | ✅ | ✅ | paged_adamw_8bit | ✅ | sdpa  | 9.2 GB | 9.6 GB | 9.6 GB | 0.01 it/s | 0.03 it/s | 0.04 it/s | ✅ |\n",
+    "\n",
+    "With all efficiency techniques enabled, **memory usage on Colab T4 is reduced by ~7×**, making it possible to **fine-tune a 7B model on free Colab** where naive FP16 training would fail.\n",
+    "\n",
+    "> A small trade-off in training speed is observed, but the **VRAM reduction is the key enabler**. For faster training on compatible hardware, **vLLM** can also be leveraged.\n",
+    "\n",
+    "> 💡 Note: For a fair comparison, the number of generations and the batch size were not changed."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "9RFq6Op7rjc3"
+   },
+   "source": [
+    "## Install dependencies\n",
+    "\n",
+    "We'll install **TRL** with the **PEFT** extra, which ensures all main dependencies such as **Transformers** and **PEFT** (a package for parameter-efficient fine-tuning, e.g., LoRA/QLoRA) are included. Additionally, we'll install **trackio** to log and monitor our experiments, **bitsandbytes** to enable quantization of LLMs, reducing memory consumption for both inference and training, and **liger-kernel** for more efficient training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "c2jy45nfWbdo"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -Uq \"trl[peft]\" bitsandbytes trackio math_verify liger-kernel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "B33zJG_Q_qb3"
+   },
+   "source": [
+    "### Log in to Hugging Face\n",
+    "\n",
+    "Log in to your **Hugging Face** account to save your fine-tuned model, track your experiment results directly on the Hub or access gated models. You can find your **access token** on your [account settings page](https://huggingface.co/settings/tokens)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "eec717d21e734c4da066763b4a6add7e"
+     ]
     },
+    "id": "8zqnTyUDWbdo",
+    "outputId": "62d71aaf-352b-4736-acb9-189d78654718"
+   },
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "cTEw4xlFrhnQ"
+   },
+   "source": [
+    "## Load Dataset\n",
+    "\n",
+    "In this step, we load the [**AI-MO/NuminaMath-TIR**](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset from the Hugging Face Hub using the `datasets` library.\n",
+    "This dataset focuses on **mathematical reasoning**, featuring problems that require step-by-step logical solutions.\n",
+    "By fine-tuning a model that does not yet exhibit strong reasoning capabilities, it can learn to **generate structured reasoning steps**, enhancing both the model's **accuracy** and **interpretability** on math-related tasks.\n",
+    "\n",
+    "For efficiency, we'll load only a **small portion of the training split**:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zU5icx67Wbdp",
+    "outputId": "6480b287-dc0e-4e79-feda-f5e4f41d2a82"
+   },
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset_name = 'AI-MO/NuminaMath-TIR'\n",
+    "train_dataset = load_dataset(dataset_name, split='train[:5%]')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "P1AIokQrBEGw"
+   },
+   "source": [
+    "Let's check the structure of the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ff6Gx1TWWbdp",
+    "outputId": "30d49bed-273a-47d9-d131-a677ca5a8b65"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "QY5hkOqDBGns"
-      },
-      "source": [
-        "Let's check one sample:"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['problem', 'solution', 'messages'],\n",
+      "    num_rows: 3622\n",
+      "})\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(train_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "QY5hkOqDBGns"
+   },
+   "source": [
+    "Let's check one sample:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-y9c7i29Wbdp",
+    "outputId": "760662ea-4db4-4b8e-c234-92ae2c8ecc17"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "-y9c7i29Wbdp",
-        "outputId": "760662ea-4db4-4b8e-c234-92ae2c8ecc17"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "{'problem': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$?  Express your answer as a common fraction.', 'solution': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. 
Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'messages': [{'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$?  Express your answer as a common fraction.', 'role': 'user'}, {'content': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). 
In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'role': 'assistant'}]}\n"
-          ]
-        }
-      ],
-      "source": [
-        "print(train_dataset[0])"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'problem': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$?  Express your answer as a common fraction.', 'solution': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. 
Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'messages': [{'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$?  Express your answer as a common fraction.', 'role': 'user'}, {'content': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). 
In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'role': 'assistant'}]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(train_dataset[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DiqBlxK_A0SD"
+   },
+   "source": [
+    "We will adapt our dataset to a conversational format using a custom system prompt, guiding the LLM to generate both step-by-step reasoning and the final answer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "RWxK5xFKWbdp"
+   },
+   "outputs": [],
+   "source": [
+    "SYSTEM_PROMPT = (\n",
+    "    \"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant  \"\n",
+    "    \"first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning \"\n",
+    "    \"process is enclosed strictly within <think> and </think> tags. \"\n",
+    "    \"After closing </think>, the assistant MUST provide the final answer in plain text.\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def make_conversation(example):\n",
+    "    return {\n",
+    "        \"prompt\": [\n",
+    "            {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+    "            {\"role\": \"user\", \"content\": example[\"problem\"]},\n",
+    "        ],\n",
+    "    }\n",
+    "\n",
+    "train_dataset = train_dataset.map(make_conversation)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "sND566XAC0kD"
+   },
+   "source": [
+    "Let's take a look at an example:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Q-kHUmpMWbdp",
+    "outputId": "452beb3a-1091-46d4-997e-04b91562d66c"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "DiqBlxK_A0SD"
-      },
-      "source": [
-        "We will adapt our dataset to a conversational format using a custom system prompt, guiding the LLM to generate both step-by-step reasoning and the final answer."
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant  first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process is enclosed strictly within <think> and </think> tags. After closing </think>, the assistant MUST provide the final answer in plain text.', 'role': 'system'}, {'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$?  Express your answer as a common fraction.', 'role': 'user'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(train_dataset[0]['prompt'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bw0qcp-CC3G0"
+   },
+   "source": [
+    "We'll remove the `messages` and `problem` columns, as we only need the custom `prompt` column and `solution` to verify the generated answer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SzbF3hdRWbdp",
+    "outputId": "bd59a383-1d4e-4020-c232-79ce66073fd1"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "RWxK5xFKWbdp"
-      },
-      "outputs": [],
-      "source": [
-        "SYSTEM_PROMPT = (\n",
-        "    \"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant  \"\n",
-        "    \"first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning \"\n",
-        "    \"process is enclosed strictly within <think> and </think> tags. \"\n",
-        "    \"After closing </think>, the assistant MUST provide the final answer in plain text.\"\n",
-        ")\n",
-        "\n",
-        "\n",
-        "def make_conversation(example):\n",
-        "    return {\n",
-        "        \"prompt\": [\n",
-        "            {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
-        "            {\"role\": \"user\", \"content\": example[\"problem\"]},\n",
-        "        ],\n",
-        "    }\n",
-        "\n",
-        "train_dataset = train_dataset.map(make_conversation)"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['solution', 'prompt'],\n",
+      "    num_rows: 3622\n",
+      "})\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_dataset = train_dataset.remove_columns(['messages', 'problem'])\n",
+    "print(train_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "tvs5rjQBr7af"
+   },
+   "source": [
+    "## Load model and configure LoRA/QLoRA\n",
+    "\n",
+    "Below, choose your **preferred model**. All of the options have been tested on **free Colab instances**.\n",
+    "\n",
+    "> 💡 Note: Some models, such as Qwen2.5 and Qwen3, are known to have been pretrained on data that improves their math performance. Be cautious when selecting the appropriate model for training to ensure meaningful fine-tuning results ([source](https://thinkingmachines.ai/blog/lora/))."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "7_uaW3JfWbdp"
+   },
+   "outputs": [],
+   "source": [
+    "# Select one model below by uncommenting the line you want to use 👇\n",
+    "## Qwen\n",
+    "model_id, output_dir = \"Qwen/Qwen2-7B-Instruct\", \"t4-Qwen2-7B-Instruct-GRPO\"                             # ✅ ~9.2GB VRAM\n",
+    "# model_id, output_dir = \"unsloth/qwen3-14b-unsloth-bnb-4bit\", \"qwen3-14b-unsloth-bnb-4bit-GRPO\"         # ⚠️ OOM with this config; fits if GRPO params are reduced\n",
+    "# model_id, output_dir = \"Qwen/Qwen3-8B\", \"Qwen3-8B-GRPO\"                                                # ✅ ~9.9GB VRAM\n",
+    "# model_id, output_dir = \"Qwen/Qwen2.5-7B-Instruct\", \"Qwen2.5-7B-Instruct-GRPO\"                          # ✅ ~9.2GB VRAM\n",
+    "\n",
+    "## Llama\n",
+    "# model_id, output_dir = \"meta-llama/Llama-3.2-3B-Instruct\", \"Llama-3.2-3B-Instruct-GRPO\"             # ✅ ~5.7GB VRAM\n",
+    "# model_id, output_dir = \"meta-llama/Llama-3.1-8B-Instruct\", \"Llama-3.1-8B-Instruct-GRPO\"             # ✅ ~9.5GB VRAM\n",
+    "\n",
+    "## LFM2.5\n",
+    "# model_id, output_dir = \"LiquidAI/LFM2.5-1.2B-Instruct\", \"LFM2.5-1.2B-Instruct-GRPO\"                                   # ✅ ~1.12 GB VRAM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aw__94OWDnER"
+   },
+   "source": [
+    "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply set `quantization_config = None` below (training without quantization consumes more memory).\n",
+    "\n",
+    "Let's configure **QLoRA** by defining a `BitsAndBytesConfig`. We pass the model id and this config directly to the trainer, which loads and quantizes the model for us. We don't need to configure the tokenizer since the trainer takes care of that automatically."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "1130e5a744864ca5b5873731e4764983"
+     ]
     },
+    "id": "o86TnTchWbdp",
+    "outputId": "77a7e6c8-0360-40f1-eea7-b941be031366"
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import BitsAndBytesConfig\n",
+    "\n",
+    "# QLoRA: 4-bit quantization config passed to the trainer (set to None for plain LoRA).\n",
+    "quantization_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,                        # Load the model in 4-bit precision to save memory\n",
+    "    bnb_4bit_compute_dtype=torch.float16,     # Data type used for internal computations in quantization\n",
+    "    bnb_4bit_use_double_quant=True,           # Also quantize the quantization constants to save extra memory\n",
+    "    bnb_4bit_quant_type=\"nf4\",                # Type of quantization. \"nf4\" is recommended for recent LLMs\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "AM-G0_QmDyZC"
+   },
+   "source": [
+    "The following cell defines LoRA (or QLoRA if needed). When training with LoRA/QLoRA, we use a **base model** (the one selected above) and, instead of modifying its original weights, we fine-tune a **LoRA adapter**, a lightweight layer that enables efficient and memory-friendly training. The **`target_modules`** specify which parts of the model (e.g., attention or projection layers) will be adapted by LoRA during fine-tuning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "WIz2pmX6Wbdp"
+   },
+   "outputs": [],
+   "source": [
+    "from peft import LoraConfig\n",
+    "\n",
+    "# You may need to update `target_modules` depending on the architecture of your chosen model.\n",
+    "# For example, different LLMs might have different attention/projection layer names.\n",
+    "peft_config = LoraConfig(\n",
+    "    r=32,\n",
+    "    lora_alpha=32,\n",
+    "    target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "prKnAp-Esyiq"
+   },
+   "source": [
+    "## Train model\n",
+    "\n",
+    "GRPO requires **reward functions** to guide the learning process. For convenience, we can directly load pre-defined rewards from `trl.rewards`, which already includes a [collection of ready-to-use rewards](https://huggingface.co/docs/trl/rewards).\n",
+    "\n",
+    "If you want to create your own custom reward functions to teach the model, a reward function is simply a Python function that takes the generated completions and returns a list of floats. For example, the following function, which we use in this notebook, rewards completions that correctly follow the `<think>` format:\n",
+    "\n",
+    "```python\n",
+    "def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:\n",
+    "    pattern = r\"^<think>(?!.*<think>)(.*?)</think>.*$\"\n",
+    "    completion_contents = [completion[0][\"content\"] for completion in completions]\n",
+    "    matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents]\n",
+    "    return [1.0 if match else 0.0 for match in matches]\n",
+    "```\n",
+    "\n",
+    "In this notebook, we will use both `think_format_reward`, which rewards completions that correctly follow the `<think>` format, and `reasoning_accuracy_reward`, which evaluates the correctness of the model's solution to the mathematical problem. Together, these rewards guide the model to generate **structured reasoning** while producing **accurate answers**."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "lj42Qs5vWbdp"
+   },
+   "outputs": [],
+   "source": [
+    "from trl.rewards import think_format_reward, reasoning_accuracy_reward"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bFgYgxMbtbEZ"
+   },
+   "source": [
+    "We'll configure **GRPO** using `GRPOConfig`, keeping the parameters minimal so that the training can run on a free Colab instance. You can adjust these settings if you have access to more resources. For a complete list of available parameters and their descriptions, refer to the [TRL GRPOConfig documentation](https://huggingface.co/docs/trl/grpo_trainer#trl.GRPOConfig).\n",
+    "\n",
+    "> 💡 Note: TRL supports using **vLLM** for generation during GRPO training, which can significantly speed up training. However, it increases VRAM usage since a separate vLLM process is active to handle generation. In this notebook, we do not enable vLLM because we are using **QLoRA**, which updates the quantized vLLM model weights at every step. Enabling vLLM in this setup can cause weight precision issues and make convergence more challenging. The configuration includes the vLLM parameters in case you want to experiment with it. Learn more about vLLM integration in TRL [here](https://huggingface.co/docs/trl/main/en/vllm_integration)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "JY11EQMhWbdp"
+   },
+   "outputs": [],
+   "source": [
+    "from trl import GRPOConfig\n",
+    "\n",
+    "# Configure training arguments using GRPOConfig\n",
+    "training_args = GRPOConfig(\n",
+    "    # Model loading (passed to `from_pretrained` when the trainer loads the model)\n",
+    "    model_init_kwargs={\n",
+    "        \"attn_implementation\": \"sdpa\",                      # Change to Flash Attention if GPU has support\n",
+    "        \"dtype\": \"float32\",                                 # Change to bfloat16 if GPU has support\n",
+    "    },\n",
+    "\n",
+    "    # Training schedule / optimization\n",
+    "    learning_rate=2e-5,                                     # Learning rate for the optimizer\n",
+    "    #num_train_epochs=1,\n",
+    "    max_steps=500,                                          # Total number of training steps. For full trainings, use `num_train_epochs` instead\n",
+    "\n",
+    "    # Parameters that control GRPO training (you can adapt them)\n",
+    "    per_device_train_batch_size = 8,\n",
+    "    max_completion_length=256, # default: 256               # Max completion length produced during training\n",
+    "    num_generations=8, # default: 8                         # Number of generations produced during training for comparison\n",
+    "\n",
+    "    # Optimizations\n",
+    "    optim = \"paged_adamw_8bit\",                             # Optimizer\n",
+    "    use_liger_kernel=True,                                  # Enable Liger kernel optimizations for faster training\n",
+    "\n",
+    "    # Parameters related to reporting and saving\n",
+    "    output_dir=output_dir,                                  # Where to save model checkpoints and logs\n",
+    "    logging_steps=10,                                       # Log training metrics every N steps\n",
+    "    report_to=\"trackio\",                                    # Experiment tracking tool\n",
+    "    trackio_space_id=output_dir,                            # HF Space where the experiment tracking will be saved\n",
+    "    log_completions=False,                                  # Log sample model completions during training\n",
+    "\n",
+    "    # Hub integration\n",
+    "    push_to_hub=True,                                       # Automatically push the trained model to the Hugging Face Hub\n",
+    "                                                            # The model will be saved under your Hub account in the repository named `output_dir`\n",
+    "    # vLLM params\n",
+    "    #use_vllm=False,                                        # Activate vLLM training for faster training\n",
+    "    #vllm_mode='colocate',\n",
+    "    #vllm_gpu_memory_utilization=0.1,\n",
+    "    #vllm_enable_sleep_mode=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-9LlOAvWFSor"
+   },
+   "source": [
+    "Configure the `GRPOTrainer` by passing the previously defined `training_args`. To keep memory usage low, we are not using an evaluation dataset, but you can include one if desired. We also provide the reward functions that were imported earlier to guide the training process."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "iI_E9KCUWbdq"
+   },
+   "outputs": [],
+   "source": [
+    "from trl import GRPOTrainer\n",
+    "\n",
+    "trainer = GRPOTrainer(\n",
+    "    model=model_id,\n",
+    "    reward_funcs=[think_format_reward, reasoning_accuracy_reward],\n",
+    "    args=training_args,\n",
+    "    train_dataset=train_dataset,\n",
+    "    quantization_config=quantization_config,\n",
+    "    peft_config=peft_config,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8dY7bK8FGLhh"
+   },
+   "source": [
+    "Show memory stats before training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "PEVRGlrAWbdq",
+    "outputId": "78fac9e4-4ae6-4836-bd10-c30b39059782"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "sND566XAC0kD"
-      },
-      "source": [
-        "Let's take a look at an example:"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GPU = Tesla T4. Max memory = 14.741 GB.\n",
+      "6.773 GB of memory reserved.\n"
+     ]
+    }
+   ],
+   "source": [
+    "gpu_stats = torch.cuda.get_device_properties(0)\n",
+    "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+    "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
+    "\n",
+    "print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
+    "print(f\"{start_gpu_memory} GB of memory reserved.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "z-5xPtfIGQL5"
+   },
+   "source": [
+    "And train!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Training on a T4 in Colab with the configuration defined in this notebook takes around 13 hours. If you're just experimenting, you can try the following quicker task ([source](https://huggingface.co/learn/llm-course/en/chapter12/5)):\n",
+    "\n",
+    "```python\n",
+    "dataset = load_dataset(\"mlabonne/smoltldr\")\n",
+    "\n",
+    "# Reward function\n",
+    "ideal_length = 50\n",
+    "\n",
+    "def reward_len(completions, **kwargs):\n",
+    "    return [-abs(ideal_length - len(completion)) for completion in completions]\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zl7-PmoXWbdq",
+    "outputId": "f39c8c3c-43c2-4f2d-c98d-4c595ae1129f"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Q-kHUmpMWbdp",
-        "outputId": "452beb3a-1091-46d4-997e-04b91562d66c"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant  first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process is enclosed strictly within <think> and </think> tags. After closing </think>, the assistant MUST provide the final answer in plain text.', 'role': 'system'}, {'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$?  Express your answer as a common fraction.', 'role': 'user'}]\n"
-          ]
-        }
-      ],
-      "source": [
-        "print(train_dataset[0]['prompt'])"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.\n"
+     ]
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "bw0qcp-CC3G0"
-      },
-      "source": [
-        "We'll remove the `messages` and `problem` columns, as we only need the custom `prompt` column and `solution` to verify the generated answer."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Trackio project initialized: huggingface\n",
+      "* Trackio metrics will be synced to Hugging Face Dataset: sergiopaniego/t4-Qwen2-7B-Instruct-GRPO-dataset\n",
+      "* Creating new space: https://huggingface.co/spaces/sergiopaniego/t4-Qwen2-7B-Instruct-GRPO\n",
+      "* View dashboard by going to: https://sergiopaniego-t4-Qwen2-7B-Instruct-GRPO.hf.space/\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "SzbF3hdRWbdp",
-        "outputId": "bd59a383-1d4e-4020-c232-79ce66073fd1"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Dataset({\n",
-            "    features: ['solution', 'prompt'],\n",
-            "    num_rows: 3622\n",
-            "})\n"
-          ]
-        }
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"https://sergiopaniego-t4-Qwen2-7B-Instruct-GRPO.hf.space/\" width=\"100%\" height=\"1000px\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
       ],
-      "source": [
-        "train_dataset = train_dataset.remove_columns(['messages', 'problem'])\n",
-        "print(train_dataset)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "tvs5rjQBr7af"
-      },
-      "source": [
-        "## Load model and configure LoRA/QLoRA\n",
-        "\n",
-        "Below, choose your **preferred model**. All of the options have been tested on **free Colab instances**.\n",
-        "\n",
-        "> 💡 Note: Some models, such as Qwen2.5 and Qwen3, are known to have been pretrained on data that improves their math performance. Be cautious when selecting the appropriate model for training to ensure meaningful fine-tuning results ([source](https://thinkingmachines.ai/blog/lora/))."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "7_uaW3JfWbdp"
-      },
-      "outputs": [],
-      "source": [
-        "# Select one model below by uncommenting the line you want to use 👇\n",
-        "## Qwen\n",
-        "model_id, output_dir = \"Qwen/Qwen2-7B-Instruct\", \"t4-Qwen2-7B-Instruct-GRPO\"                             # ✅ ~9.2GB VRAM\n",
-        "# model_id, output_dir = \"unsloth/qwen3-14b-unsloth-bnb-4bit\", \"qwen3-14b-unsloth-bnb-4bit-GRPO\"         # ⚠️ OOM with this config; fits if GRPO params are reduced\n",
-        "# model_id, output_dir = \"Qwen/Qwen3-8B\", \"Qwen3-8B-GRPO\"                                                # ✅ ~9.9GB VRAM\n",
-        "# model_id, output_dir = \"Qwen/Qwen2.5-7B-Instruct\", \"Qwen2.5-7B-Instruct-GRPO\"                          # ✅ ~9.2GB VRAM\n",
-        "\n",
-        "## Llama\n",
-        "# model_id, output_dir = \"meta-llama/Llama-3.2-3B-Instruct\", \"Llama-3.2-3B-Instruct-GRPO\"             # ✅ ~5.7GB VRAM\n",
-        "# model_id, output_dir = \"meta-llama/Llama-3.1-8B-Instruct\", \"Llama-3.1-8B-Instruct-GRPO\"             # ✅ ~9.5GB VRAM\n",
-        "\n",
-        "## LFM2.5\n",
-        "# model_id, output_dir = \"LiquidAI/LFM2.5-1.2B-Instruct\", \"LFM2.5-1.2B-Instruct-GRPO\"                                   # ✅ ~1.12 GB VRAM"
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "aw__94OWDnER"
-      },
-      "source": [
-        "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply comment out the `BitsAndBytesConfig` configuration (training without quantization consumes more memory).\n",
-        "\n",
-        "Let's load the selected model using `transformers`, configuring QLoRA via `bitsandbytes` (you can remove it if doing LoRA). We don't need to configure the tokenizer since the trainer takes care of that automatically."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Created new run: sergiopaniego-1766143600\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "1130e5a744864ca5b5873731e4764983"
-          ]
-        },
-        "id": "o86TnTchWbdp",
-        "outputId": "77a7e6c8-0360-40f1-eea7-b941be031366"
-      },
-      "outputs": [
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "1130e5a744864ca5b5873731e4764983",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        }
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='500' max='500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [500/500 13:05:04, Epoch 0/1]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>10</td>\n",
+       "      <td>0.027900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>20</td>\n",
+       "      <td>-0.011600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>30</td>\n",
+       "      <td>0.021500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>40</td>\n",
+       "      <td>0.033400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>50</td>\n",
+       "      <td>0.039400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>60</td>\n",
+       "      <td>0.010300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>70</td>\n",
+       "      <td>0.048200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>80</td>\n",
+       "      <td>0.067300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>90</td>\n",
+       "      <td>0.030600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>100</td>\n",
+       "      <td>0.064000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>110</td>\n",
+       "      <td>0.021500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>120</td>\n",
+       "      <td>0.021400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>130</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>140</td>\n",
+       "      <td>-0.028500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>150</td>\n",
+       "      <td>-0.003100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>160</td>\n",
+       "      <td>0.017300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>170</td>\n",
+       "      <td>-0.024700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>180</td>\n",
+       "      <td>0.003300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>190</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>200</td>\n",
+       "      <td>-0.001400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>210</td>\n",
+       "      <td>0.008000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>220</td>\n",
+       "      <td>0.034300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>230</td>\n",
+       "      <td>0.044600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>240</td>\n",
+       "      <td>0.016400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>250</td>\n",
+       "      <td>-0.015200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>260</td>\n",
+       "      <td>0.016800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>270</td>\n",
+       "      <td>0.042900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>280</td>\n",
+       "      <td>0.031300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>290</td>\n",
+       "      <td>0.006200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>300</td>\n",
+       "      <td>0.043300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>310</td>\n",
+       "      <td>0.029700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>320</td>\n",
+       "      <td>0.001100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>330</td>\n",
+       "      <td>0.027000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>340</td>\n",
+       "      <td>-0.006700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>350</td>\n",
+       "      <td>0.027200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>360</td>\n",
+       "      <td>0.008200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>370</td>\n",
+       "      <td>-0.015800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>380</td>\n",
+       "      <td>0.007200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>390</td>\n",
+       "      <td>0.012100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>400</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>410</td>\n",
+       "      <td>0.010500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>420</td>\n",
+       "      <td>0.019800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>430</td>\n",
+       "      <td>0.000800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>440</td>\n",
+       "      <td>0.003400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>450</td>\n",
+       "      <td>-0.007900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>460</td>\n",
+       "      <td>-0.011800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>470</td>\n",
+       "      <td>-0.016300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>480</td>\n",
+       "      <td>-0.002300</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>490</td>\n",
+       "      <td>-0.005500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>500</td>\n",
+       "      <td>0.038000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
       ],
-      "source": [
-        "import torch\n",
-        "from transformers import AutoModelForCausalLM, BitsAndBytesConfig\n",
-        "\n",
-        "model = AutoModelForCausalLM.from_pretrained(\n",
-        "    model_id,\n",
-        "    attn_implementation=\"sdpa\",                   # Change to Flash Attention if GPU has support\n",
-        "    dtype=\"float32\",                          # Change to bfloat16 if GPU has support\n",
-        "    quantization_config=BitsAndBytesConfig(\n",
-        "        load_in_4bit=True,                        # Load the model in 4-bit precision to save memory\n",
-        "        bnb_4bit_compute_dtype=torch.float16,     # Data type used for internal computations in quantization\n",
-        "        bnb_4bit_use_double_quant=True,           # Use double quantization to improve accuracy\n",
-        "        bnb_4bit_quant_type=\"nf4\"                 # Type of quantization. \"nf4\" is recommended for recent LLMs\n",
-        "    )\n",
-        ")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "AM-G0_QmDyZC"
-      },
-      "source": [
-        "The following cell defines LoRA (or QLoRA if needed). When training with LoRA/QLoRA, we use a **base model** (the one selected above) and, instead of modifying its original weights, we fine-tune a **LoRA adapter**, a lightweight layer that enables efficient and memory-friendly training. The **`target_modules`** specify which parts of the model (e.g., attention or projection layers) will be adapted by LoRA during fine-tuning."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "WIz2pmX6Wbdp"
-      },
-      "outputs": [],
-      "source": [
-        "from peft import LoraConfig\n",
-        "\n",
-        "# You may need to update `target_modules` depending on the architecture of your chosen model.\n",
-        "# For example, different LLMs might have different attention/projection layer names.\n",
-        "peft_config = LoraConfig(\n",
-        "    r=32,\n",
-        "    lora_alpha=32,\n",
-        "    target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
-        ")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "prKnAp-Esyiq"
-      },
-      "source": [
-        "## Train model\n",
-        "\n",
-        "GRPO requires **reward functions** to guide the learning process. For convenience, we can directly load pre-defined rewards from `trl.rewards`, which already includes a [collection of ready-to-use rewards](https://huggingface.co/docs/trl/rewards).\n",
-        "\n",
-        "If you want to create your own custom reward functions to teach the model, a reward function is simply a Python function that takes the generated completions and returns a list of floats. For example, the following function, which we use in this notebook, rewards completions that correctly follow the `<think>` format:\n",
-        "\n",
-        "```python\n",
-        "def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:\n",
-        "    pattern = r\"^<think>(?!.*<think>)(.*?)</think>.*$\"\n",
-        "    completion_contents = [completion[0][\"content\"] for completion in completions]\n",
-        "    matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents]\n",
-        "    return [1.0 if match else 0.0 for match in matches]\n",
-        "```\n",
-        "\n",
-        "In this notebook, we will use both `think_format_reward`, which rewards completions that correctly follow the `<think>` format, and `reasoning_accuracy_reward`, which evaluates the correctness of the model's solution to the mathematical problem. Together, these rewards guide the model to generate **structured reasoning** while producing **accurate answers**."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "lj42Qs5vWbdp"
-      },
-      "outputs": [],
-      "source": [
-        "from trl.rewards import think_format_reward, reasoning_accuracy_reward"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "bFgYgxMbtbEZ"
-      },
-      "source": [
-        "We'll configure **GRPO** using `GRPOConfig`, keeping the parameters minimal so that the training can run on a free Colab instance. You can adjust these settings if you have access to more resources. For a complete list of available parameters and their descriptions, refer to the [TRL GRPOConfig documentation](https://huggingface.co/docs/trl/grpo_trainer#trl.GRPOConfig).\n",
-        "\n",
-        "> 💡 Note: TRL supports using **vLLM** for generation during GRPO training, which can significantly speed up training. However, it increases VRAM usage since a separate vLLM process is active to handle generation. In this notebook, we do not enable vLLM because we are using **QLoRA**, which updates the quantized vLLM model weights at every step. Enabling vLLM in this setup can cause weight precision issues and make convergence more challenging. The configuration includes the vLLM parameters in case you want to experiment with it. Learn more about vLLM integration in TRL [here](https://huggingface.co/docs/trl/main/en/vllm_integration)."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "JY11EQMhWbdp"
-      },
-      "outputs": [],
-      "source": [
-        "from trl import GRPOConfig\n",
-        "\n",
-        "# Configure training arguments using GRPOConfig\n",
-        "training_args = GRPOConfig(\n",
-        "    # Training schedule / optimization\n",
-        "    learning_rate=2e-5,                                     # Learning rate for the optimizer\n",
-        "    #num_train_epochs=1,\n",
-        "    max_steps=500,                                          # Number of dataset passes. For full trainings, use `num_train_epochs` instead\n",
-        "\n",
-        "    # Parameters that control GRPO training (you can adapt them)\n",
-        "    per_device_train_batch_size = 8,\n",
-        "    max_completion_length=256, # default: 256               # Max completion length produced during training\n",
-        "    num_generations=8, # default: 8                         # Number of generations produced during trainig for comparison\n",
-        "\n",
-        "    # Optimizations\n",
-        "    optim = \"paged_adamw_8bit\",                             # Optimizer\n",
-        "    use_liger_kernel=True,                                  # Enable Liger kernel optimizations for faster training\n",
-        "\n",
-        "    # Parameters related to reporting and saving\n",
-        "    output_dir=output_dir,                                  # Where to save model checkpoints and logs\n",
-        "    logging_steps=10,                                       # Log training metrics every N steps\n",
-        "    report_to=\"trackio\",                                    # Experiment tracking tool\n",
-        "    trackio_space_id=output_dir,                            # HF Space where the experiment tracking will be saved\n",
-        "    log_completions=False,                                  # Return model completions during training\n",
-        "\n",
-        "    # Hub integration\n",
-        "    push_to_hub=True,                                       # Automatically push the trained model to the Hugging Face Hub\n",
-        "                                                            # The model will be saved under your Hub account in the repository named `output_dir`\n",
-        "    # vLLM params\n",
-        "    #use_vllm=False,                                        # Activate vLLM training for faster training\n",
-        "    #vllm_mode='colocate',\n",
-        "    #vllm_gpu_memory_utilization=0.1,\n",
-        "    #vllm_enable_sleep_mode=True\n",
-        ")"
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "-9LlOAvWFSor"
-      },
-      "source": [
-        "Configure the `GRPOTrainer` by passing the previously defined `training_args`. To keep memory usage low, we are not using an evaluation dataset, but you can include one if desired. We also provide the reward functions that were imported earlier to guide the training process."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "iI_E9KCUWbdq"
-      },
-      "outputs": [],
-      "source": [
-        "from trl import GRPOTrainer\n",
-        "\n",
-        "trainer = GRPOTrainer(\n",
-        "    model=model,\n",
-        "    reward_funcs=[think_format_reward, reasoning_accuracy_reward],\n",
-        "    args=training_args,\n",
-        "    train_dataset=train_dataset,\n",
-        "    peft_config=peft_config,\n",
-        ")"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Run finished. Uploading logs to Trackio (please wait...)\n"
+     ]
+    }
+   ],
+   "source": [
+    "trainer_stats = trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "iqAN-XLCGTGW"
+   },
+   "source": [
+    "Show memory stats after training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "4BeEwp5EWbds",
+    "outputId": "668b8a2c-2eef-4e34-8d4a-2a43ccbbdc00"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "8dY7bK8FGLhh"
-      },
-      "source": [
-        "Show memory stats before training"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "47228.679 seconds used for training.\n",
+      "787.14 minutes used for training.\n",
+      "Peak reserved memory = 8.832 GB.\n",
+      "Peak reserved memory for training = 2.059 GB.\n",
+      "Peak reserved memory % of max memory = 59.915 %.\n",
+      "Peak reserved memory for training % of max memory = 13.968 %.\n"
+     ]
+    }
+   ],
+   "source": [
+    "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+    "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
+    "used_percentage = round(used_memory / max_memory * 100, 3)\n",
+    "lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
+    "\n",
+    "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
+    "print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n",
+    "print(f\"Peak reserved memory = {used_memory} GB.\")\n",
+    "print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
+    "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
+    "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "R8Sd_AqILeYi"
+   },
+   "source": [
+    "The training procedure generates both standard training logs and **trackio** logs, which help us monitor the training progress. Example outputs would look like the following:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2bPn6gruLf-n"
+   },
+   "source": [
+    "<img src=\"https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo-qlora-notebook-trackio.png\" width=\"50%\">"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ibO4f7tuLboQ"
+   },
+   "source": [
+    "## Saving the fine-tuned model\n",
+    "\n",
+    "In this step, we save the fine-tuned model both **locally** and to the **Hugging Face Hub** using the credentials from your account."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "e6a3677667ce47bcba55e3e950e446f9",
+      "17adb84604d84cf688a89a21f6cc6150",
+      "a21c1bbd3cd04738a8c96fbfc0c016c6",
+      "65cadde3da7642188f029bb2aceaa7c6",
+      "0404b89e5ce24e76958c72bedc1a95cc",
+      "c52baf990fde40c0873747e827dc6926",
+      "191653e8ce184123a68f26fbf2b78745",
+      "0bb882d400864b249c80132264de2623",
+      "09cbfcf6e51c431798f4e392a81be6d3",
+      "d6521f73f23f42e18ee462a547f251a1"
+     ]
     },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "PEVRGlrAWbdq",
-        "outputId": "78fac9e4-4ae6-4836-bd10-c30b39059782"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "GPU = Tesla T4. Max memory = 14.741 GB.\n",
-            "6.773 GB of memory reserved.\n"
-          ]
-        }
-      ],
-      "source": [
-        "gpu_stats = torch.cuda.get_device_properties(0)\n",
-        "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
-        "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
-        "\n",
-        "print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
-        "print(f\"{start_gpu_memory} GB of memory reserved.\")"
-      ]
+    "id": "itpVDjy0Wbdt",
+    "outputId": "b821c7ed-6c9d-440a-a797-e25291627bef"
+   },
+   "outputs": [],
+   "source": [
+    "trainer.save_model(output_dir)\n",
+    "trainer.push_to_hub(dataset_name=dataset_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "81eBZe-X7daz"
+   },
+   "source": [
+    "## Load the fine-tuned model and run inference\n",
+    "\n",
+    "Now, let's test our fine-tuned model by loading the **LoRA/QLoRA adapter** and performing **inference**. We'll start by loading the **base model**, then attach the adapter to it, creating the final fine-tuned model ready for evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "1d3fbf86d53845beac599c5b231e87ea"
+     ]
     },
+    "id": "ZLdaWYzNWbdt",
+    "outputId": "a103b64b-1f6b-4423-c5fd-402f210e6dc3"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "z-5xPtfIGQL5"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1d3fbf86d53845beac599c5b231e87ea",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "And train!"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Training on a T4 in Colab with the configuration defined in this notebook takes around 13 hours. If you're just experimenting, you can try the following quicker task ([source](https://huggingface.co/learn/llm-course/en/chapter12/5)):\n",
-        "\n",
-        "```python\n",
-        "dataset = load_dataset(\"mlabonne/smoltldr\")\n",
-        "\n",
-        "# Reward function\n",
-        "ideal_length = 50\n",
-        "\n",
-        "def reward_len(completions, **kwargs):\n",
-        "    return [-abs(ideal_length - len(completion)) for completion in completions]\n",
-        "```"
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "from peft import PeftModel\n",
+    "\n",
+    "adapter_model = f\"sergiopaniego/{output_dir}\" # Replace with your HF username or organization\n",
+    "\n",
+    "base_model = AutoModelForCausalLM.from_pretrained(model_id, dtype=\"auto\", device_map=\"auto\")\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JvwM6ym-7nnt"
+   },
+   "source": [
+    "Let's test with one example from the test set of the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "74ca3f7b365640ba883a9a236700517e"
+     ]
     },
+    "id": "XjpojLV-Wbdt",
+    "outputId": "bcc039de-72ae-4713-a1fb-c006163999e7"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "zl7-PmoXWbdq",
-        "outputId": "f39c8c3c-43c2-4f2d-c98d-4c595ae1129f"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "74ca3f7b365640ba883a9a236700517e",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "* Trackio project initialized: huggingface\n",
-            "* Trackio metrics will be synced to Hugging Face Dataset: sergiopaniego/t4-Qwen2-7B-Instruct-GRPO-dataset\n",
-            "* Creating new space: https://huggingface.co/spaces/sergiopaniego/t4-Qwen2-7B-Instruct-GRPO\n",
-            "* View dashboard by going to: https://sergiopaniego-t4-Qwen2-7B-Instruct-GRPO.hf.space/\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "<div><iframe src=\"https://sergiopaniego-t4-Qwen2-7B-Instruct-GRPO.hf.space/\" width=\"100%\" height=\"1000px\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-            ],
-            "text/plain": [
-              "<IPython.core.display.HTML object>"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "* Created new run: sergiopaniego-1766143600\n"
-          ]
-        },
-        {
-          "data": {
-            "text/html": [
-              "\n",
-              "    <div>\n",
-              "      \n",
-              "      <progress value='500' max='500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-              "      [500/500 13:05:04, Epoch 0/1]\n",
-              "    </div>\n",
-              "    <table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              " <tr style=\"text-align: left;\">\n",
-              "      <th>Step</th>\n",
-              "      <th>Training Loss</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <td>10</td>\n",
-              "      <td>0.027900</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>20</td>\n",
-              "      <td>-0.011600</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>30</td>\n",
-              "      <td>0.021500</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>40</td>\n",
-              "      <td>0.033400</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>50</td>\n",
-              "      <td>0.039400</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>60</td>\n",
-              "      <td>0.010300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>70</td>\n",
-              "      <td>0.048200</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>80</td>\n",
-              "      <td>0.067300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>90</td>\n",
-              "      <td>0.030600</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>100</td>\n",
-              "      <td>0.064000</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>110</td>\n",
-              "      <td>0.021500</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>120</td>\n",
-              "      <td>0.021400</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>130</td>\n",
-              "      <td>0.000000</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>140</td>\n",
-              "      <td>-0.028500</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>150</td>\n",
-              "      <td>-0.003100</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>160</td>\n",
-              "      <td>0.017300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>170</td>\n",
-              "      <td>-0.024700</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>180</td>\n",
-              "      <td>0.003300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>190</td>\n",
-              "      <td>0.000000</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>200</td>\n",
-              "      <td>-0.001400</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>210</td>\n",
-              "      <td>0.008000</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>220</td>\n",
-              "      <td>0.034300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>230</td>\n",
-              "      <td>0.044600</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>240</td>\n",
-              "      <td>0.016400</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>250</td>\n",
-              "      <td>-0.015200</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>260</td>\n",
-              "      <td>0.016800</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>270</td>\n",
-              "      <td>0.042900</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>280</td>\n",
-              "      <td>0.031300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>290</td>\n",
-              "      <td>0.006200</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>300</td>\n",
-              "      <td>0.043300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>310</td>\n",
-              "      <td>0.029700</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>320</td>\n",
-              "      <td>0.001100</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>330</td>\n",
-              "      <td>0.027000</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>340</td>\n",
-              "      <td>-0.006700</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>350</td>\n",
-              "      <td>0.027200</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>360</td>\n",
-              "      <td>0.008200</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>370</td>\n",
-              "      <td>-0.015800</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>380</td>\n",
-              "      <td>0.007200</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>390</td>\n",
-              "      <td>0.012100</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>400</td>\n",
-              "      <td>0.000000</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>410</td>\n",
-              "      <td>0.010500</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>420</td>\n",
-              "      <td>0.019800</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>430</td>\n",
-              "      <td>0.000800</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>440</td>\n",
-              "      <td>0.003400</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>450</td>\n",
-              "      <td>-0.007900</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>460</td>\n",
-              "      <td>-0.011800</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>470</td>\n",
-              "      <td>-0.016300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>480</td>\n",
-              "      <td>-0.002300</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>490</td>\n",
-              "      <td>-0.005500</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <td>500</td>\n",
-              "      <td>0.038000</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table><p>"
-            ],
-            "text/plain": [
-              "<IPython.core.display.HTML object>"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "* Run finished. Uploading logs to Trackio (please wait...)\n"
-          ]
-        }
-      ],
-      "source": [
-        "trainer_stats = trainer.train()"
+      "text/plain": [
+       "Map:   0%|          | 0/1 [00:00<?, ? examples/s]"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "iqAN-XLCGTGW"
-      },
-      "source": [
-        "Show memory stats after training"
+     "data": {
+      "text/plain": [
+       "[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant  first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process is enclosed strictly within <think> and </think> tags. After closing </think>, the assistant MUST provide the final answer in plain text.',\n",
+       "  'role': 'system'},\n",
+       " {'content': \"In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?\",\n",
+       "  'role': 'user'}]"
       ]
-    },
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset_name = 'AI-MO/NuminaMath-TIR'\n",
+    "test_dataset = load_dataset(dataset_name, split='test[:1%]')\n",
+    "test_dataset = test_dataset.map(make_conversation)\n",
+    "test_dataset = test_dataset.remove_columns(['messages', 'problem'])\n",
+    "test_dataset[0]['prompt']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "CxKyZwG28BYJ"
+   },
+   "source": [
+    "Let's first check what the base model outputs without the adapter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qTPJY96eWbdt",
+    "outputId": "ed02acca-e856-44ec-fa20-c32efd81e018"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "4BeEwp5EWbds",
-        "outputId": "668b8a2c-2eef-4e34-8d4a-2a43ccbbdc00"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "47228.679 seconds used for training.\n",
-            "787.14 minutes used for training.\n",
-            "Peak reserved memory = 8.832 GB.\n",
-            "Peak reserved memory for training = 2.059 GB.\n",
-            "Peak reserved memory % of max memory = 59.915 %.\n",
-            "Peak reserved memory for training % of max memory = 13.968 %.\n"
-          ]
-        }
-      ],
-      "source": [
-        "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
-        "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
-        "used_percentage = round(used_memory / max_memory * 100, 3)\n",
-        "lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
-        "\n",
-        "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
-        "print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n",
-        "print(f\"Peak reserved memory = {used_memory} GB.\")\n",
-        "print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
-        "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
-        "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "To solve this problem, let's denote the birth year of the person as \\(Y\\) (where \\(Y\\) is a four-digit number) and their age in 1988 as \\(A\\). According to the given condition, their age in 1988 is equal to the sum of the digits of their birth year. \n",
+      "\n",
+      "Since we're looking at the year 1988, the person would be \\(1988 - Y\\) years old in that year. Given the condition:\n",
+      "\n",
+      "\\[1988 - Y = \\text{sum of the digits of } Y\\]\n",
+      "\n",
+      "Let's break down the possible range for \\(Y\\). Since the person's age must be less than or equal to 100 (as the sum of the digits of any four-digit number cannot exceed 36), \\(Y\\) must be between 1989 and 2088.\n",
+      "\n",
+      "We can systematically check each year in this range to find when the condition holds true. However, considering the constraint on age, we can narrow our search significantly. For example, if \\(Y\\) were 1990, the sum of its digits would be 18, which is not a reasonable age. We need\n"
+     ]
+    }
+   ],
+   "source": [
+    "messages = test_dataset[0]['prompt']\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, add_generation_prompt=True, tokenize=False\n",
+    ")\n",
+    "model_inputs = tokenizer([text], return_tensors=\"pt\").to(base_model.device)\n",
+    "\n",
+    "generated_ids = base_model.generate(\n",
+    "    **model_inputs,\n",
+    "    max_new_tokens=256\n",
+    ")\n",
+    "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n",
+    "\n",
+    "# Decode and extract model response\n",
+    "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
+    "print(generated_text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "V9eoUwQS8SIi"
+   },
+   "source": [
+    "The base model neither produced reasoning traces nor provided a correct answer. Let's now load the fine-tuned model and check its performance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "073b351afd264bf0bf23043b37e0d8ce",
+      "3dee429faf4e40b192cabebfe4bf2245"
+     ]
     },
+    "id": "CNannsXXWbdt",
+    "outputId": "fc43a5b9-4ec6-43eb-fc34-f26e92434faf"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "R8Sd_AqILeYi"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "073b351afd264bf0bf23043b37e0d8ce",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "The training procedure generates both standard training logs and **trackio** logs, which help us monitor the training progress. Example outputs would look like the following:"
+      "text/plain": [
+       "adapter_config.json: 0.00B [00:00, ?B/s]"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "2bPn6gruLf-n"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3dee429faf4e40b192cabebfe4bf2245",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "<img src=\"https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo-qlora-notebook-trackio.png\" width=\"50%\">"
+      "text/plain": [
+       "adapter_model.safetensors:   0%|          | 0.00/162M [00:00<?, ?B/s]"
       ]
-    },
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fine_tuned_model = PeftModel.from_pretrained(base_model, adapter_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "3yOJ82F9Wbdt",
+    "outputId": "f7b2d716-0ded-4ba4-9534-0481e81b4a15"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ibO4f7tuLboQ"
-      },
-      "source": [
-        "## Saving fine tuned model\n",
-        "\n",
-        "In this step, we save the fine-tuned model both **locally** and to the **Hugging Face Hub** using the credentials from your account."
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<think> I need to find a birth year where the sum of its digits equals the person's age in 1988 </think>\n",
+      "\n",
+      "The person would have been born in 1979, since 1+9+7+9 = 26 and 26 is the age in 1988\n",
+      "\n",
+      "answer: 26\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, add_generation_prompt=True, tokenize=False\n",
+    ")\n",
+    "model_inputs = tokenizer([text], return_tensors=\"pt\").to(fine_tuned_model.device)\n",
+    "\n",
+    "generated_ids = fine_tuned_model.generate(\n",
+    "    **model_inputs,\n",
+    "    max_new_tokens=256\n",
+    ")\n",
+    "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n",
+    "\n",
+    "# Decode and extract model response\n",
+    "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
+    "print(generated_text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "OU-xDHpEEmg9"
+   },
+   "source": [
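+    "The fine-tuned model now follows the expected format: it wraps its reasoning in `<think>` tags and then gives a final answer. Note, however, that the answer in the sample output above (26) is not correct: the person was born in 1966 (1 + 9 + 6 + 6 = 22 = 1988 - 1966), so the right answer is 22. Further training would be needed to improve accuracy."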
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "XNtBOpRY8a2O"
+   },
+   "source": [
+    "## Inference and Serving with vLLM\n",
+    "\n",
+    "You can serve Transformers models with **vLLM** in real-world applications. Learn more [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "nkhu0uY78lV3"
+   },
+   "source": [
+    "### Push Merged Model (for LoRA or QLoRA Training)\n",
+    "\n",
+    "To serve the model via **vLLM**, the repository must contain the merged model (base model + LoRA adapter). Therefore, you need to upload it first."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NF8ZP9Z-Wbdt",
+    "outputId": "32a5ab71-1f0d-4289-ea12-66f5f75a957b"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "e6a3677667ce47bcba55e3e950e446f9",
-            "17adb84604d84cf688a89a21f6cc6150",
-            "a21c1bbd3cd04738a8c96fbfc0c016c6",
-            "65cadde3da7642188f029bb2aceaa7c6",
-            "0404b89e5ce24e76958c72bedc1a95cc",
-            "c52baf990fde40c0873747e827dc6926",
-            "191653e8ce184123a68f26fbf2b78745",
-            "0bb882d400864b249c80132264de2623",
-            "09cbfcf6e51c431798f4e392a81be6d3",
-            "d6521f73f23f42e18ee462a547f251a1"
-          ]
-        },
-        "id": "itpVDjy0Wbdt",
-        "outputId": "b821c7ed-6c9d-440a-a797-e25291627bef"
-      },
-      "outputs": [],
-      "source": [
-        "trainer.save_model(output_dir)\n",
-        "trainer.push_to_hub(dataset_name=dataset_name)"
+     "data": {
+      "text/plain": [
+       "('Qwen2-7B-Instruct-GRPO-merged/tokenizer_config.json',\n",
+       " 'Qwen2-7B-Instruct-GRPO-merged/special_tokens_map.json',\n",
+       " 'Qwen2-7B-Instruct-GRPO-merged/chat_template.jinja',\n",
+       " 'Qwen2-7B-Instruct-GRPO-merged/vocab.json',\n",
+       " 'Qwen2-7B-Instruct-GRPO-merged/merges.txt',\n",
+       " 'Qwen2-7B-Instruct-GRPO-merged/added_tokens.json',\n",
+       " 'Qwen2-7B-Instruct-GRPO-merged/tokenizer.json')"
       ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_merged = fine_tuned_model.merge_and_unload()\n",
+    "\n",
+    "save_dir = f\"{output_dir}-merged\"\n",
+    "\n",
+    "model_merged.save_pretrained(save_dir)\n",
+    "tokenizer.save_pretrained(save_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "d1a0574cc20046d5876cf31b21955f8b",
+      "7cc2f0ef7ad2494cad572cd898095c00",
+      "475420d92bb54dc08517ffe423b015c3",
+      "a76231aeae5a49979d1e9075b0b3eefb",
+      "b4f469f957134ea9b0e28532fe3caaf1",
+      "637e55736da34f2c9b098222ae07244a",
+      "8157e521017c450a9d2a9e41611405e9",
+      "9746ae4ab0574ed186f898dba3b4b197",
+      "d4b2a8805ec548ea85e0900ff5927574",
+      "0668cd8597f141e89ef38129c6641c1f"
+     ]
     },
+    "id": "X5Zci39rWbdt",
+    "outputId": "ca329f99-dc7b-470c-f5d9-39a3eabcb16d"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "81eBZe-X7daz"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d1a0574cc20046d5876cf31b21955f8b",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "## Load the fine-tuned model and run inference\n",
-        "\n",
-        "Now, let's test our fine-tuned model by loading the **LoRA/QLoRA adapter** and performing **inference**. We'll start by loading the **base model**, then attach the adapter to it, creating the final fine-tuned model ready for evaluation."
+      "text/plain": [
+       "Processing Files (0 / 0)      : |          |  0.00B /  0.00B            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "1d3fbf86d53845beac599c5b231e87ea"
-          ]
-        },
-        "id": "ZLdaWYzNWbdt",
-        "outputId": "a103b64b-1f6b-4423-c5fd-402f210e6dc3"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7cc2f0ef7ad2494cad572cd898095c00",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "outputs": [
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "1d3fbf86d53845beac599c5b231e87ea",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
-        "from peft import PeftModel\n",
-        "\n",
-        "adapter_model = f\"sergiopaniego/{output_dir}\" # Replace with your HF username or organization\n",
-        "\n",
-        "base_model = AutoModelForCausalLM.from_pretrained(model_id, dtype=\"auto\", device_map=\"auto\")\n",
-        "\n",
-        "tokenizer = AutoTokenizer.from_pretrained(model_id)"
+      "text/plain": [
+       "New Data Upload               : |          |  0.00B /  0.00B            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "JvwM6ym-7nnt"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "475420d92bb54dc08517ffe423b015c3",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "Let's test with one example from the test set of the dataset"
+      "text/plain": [
+       "  ...0002-of-00004.safetensors:   0%|          |  612kB / 4.93GB            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "74ca3f7b365640ba883a9a236700517e"
-          ]
-        },
-        "id": "XjpojLV-Wbdt",
-        "outputId": "bcc039de-72ae-4713-a1fb-c006163999e7"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a76231aeae5a49979d1e9075b0b3eefb",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "outputs": [
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "74ca3f7b365640ba883a9a236700517e",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "Map:   0%|          | 0/1 [00:00<?, ? examples/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "text/plain": [
-              "[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant  first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process is enclosed strictly within <think> and </think> tags. After closing </think>, the assistant MUST provide the final answer in plain text.',\n",
-              "  'role': 'system'},\n",
-              " {'content': \"In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?\",\n",
-              "  'role': 'user'}]"
-            ]
-          },
-          "execution_count": 5,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from datasets import load_dataset\n",
-        "\n",
-        "dataset_name = 'AI-MO/NuminaMath-TIR'\n",
-        "test_dataset = load_dataset(dataset_name, split='test[:1%]')\n",
-        "test_dataset = test_dataset.map(make_conversation)\n",
-        "test_dataset = test_dataset.remove_columns(['messages', 'problem'])\n",
-        "test_dataset[0]['prompt']"
+      "text/plain": [
+       "  ...0003-of-00004.safetensors:   0%|          |  611kB / 4.33GB            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "CxKyZwG28BYJ"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b4f469f957134ea9b0e28532fe3caaf1",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "Let's first check what's the output for the base model, without the adapter."
+      "text/plain": [
+       "  ...0001-of-00004.safetensors:   1%|1         | 50.3MB / 4.88GB            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "qTPJY96eWbdt",
-        "outputId": "ed02acca-e856-44ec-fa20-c32efd81e018"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "637e55736da34f2c9b098222ae07244a",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "To solve this problem, let's denote the birth year of the person as \\(Y\\) (where \\(Y\\) is a four-digit number) and their age in 1988 as \\(A\\). According to the given condition, their age in 1988 is equal to the sum of the digits of their birth year. \n",
-            "\n",
-            "Since we're looking at the year 1988, the person would be \\(1988 - Y\\) years old in that year. Given the condition:\n",
-            "\n",
-            "\\[1988 - Y = \\text{sum of the digits of } Y\\]\n",
-            "\n",
-            "Let's break down the possible range for \\(Y\\). Since the person's age must be less than or equal to 100 (as the sum of the digits of any four-digit number cannot exceed 36), \\(Y\\) must be between 1989 and 2088.\n",
-            "\n",
-            "We can systematically check each year in this range to find when the condition holds true. However, considering the constraint on age, we can narrow our search significantly. For example, if \\(Y\\) were 1990, the sum of its digits would be 18, which is not a reasonable age. We need\n"
-          ]
-        }
-      ],
-      "source": [
-        "messages = test_dataset[0]['prompt']\n",
-        "text = tokenizer.apply_chat_template(\n",
-        "    messages, add_generation_prompt=True, tokenize=False\n",
-        ")\n",
-        "model_inputs = tokenizer([text], return_tensors=\"pt\").to(base_model.device)\n",
-        "\n",
-        "generated_ids = base_model.generate(\n",
-        "    **model_inputs,\n",
-        "    max_new_tokens=256\n",
-        ")\n",
-        "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n",
-        "\n",
-        "# Decode and extract model response\n",
-        "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
-        "print(generated_text)"
+      "text/plain": [
+       "  ...0004-of-00004.safetensors:   4%|3         | 41.9MB / 1.09GB            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "V9eoUwQS8SIi"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8157e521017c450a9d2a9e41611405e9",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "The base model neither produced reasoning traces nor provided a correct answer. Let's now load the fine-tuned model and check its performance."
+      "text/plain": [
+       "README.md: 0.00B [00:00, ?B/s]"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "073b351afd264bf0bf23043b37e0d8ce",
-            "3dee429faf4e40b192cabebfe4bf2245"
-          ]
-        },
-        "id": "CNannsXXWbdt",
-        "outputId": "fc43a5b9-4ec6-43eb-fc34-f26e92434faf"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9746ae4ab0574ed186f898dba3b4b197",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "outputs": [
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "073b351afd264bf0bf23043b37e0d8ce",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "adapter_config.json: 0.00B [00:00, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "3dee429faf4e40b192cabebfe4bf2245",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "adapter_model.safetensors:   0%|          | 0.00/162M [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        }
-      ],
-      "source": [
-        "fine_tuned_model = PeftModel.from_pretrained(base_model, adapter_model)"
+      "text/plain": [
+       "Processing Files (0 / 0)      : |          |  0.00B /  0.00B            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "3yOJ82F9Wbdt",
-        "outputId": "f7b2d716-0ded-4ba4-9534-0481e81b4a15"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d4b2a8805ec548ea85e0900ff5927574",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "<think> I need to find a birth year where the sum of its digits equals the person's age in 1988 </think>\n",
-            "\n",
-            "The person would have been born in 1979, since 1+9+7+9 = 26 and 26 is the age in 1988\n",
-            "\n",
-            "answer: 26\n"
-          ]
-        }
-      ],
-      "source": [
-        "text = tokenizer.apply_chat_template(\n",
-        "    messages, add_generation_prompt=True, tokenize=False\n",
-        ")\n",
-        "model_inputs = tokenizer([text], return_tensors=\"pt\").to(fine_tuned_model.device)\n",
-        "\n",
-        "generated_ids = fine_tuned_model.generate(\n",
-        "    **model_inputs,\n",
-        "    max_new_tokens=256\n",
-        ")\n",
-        "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n",
-        "\n",
-        "# Decode and extract model response\n",
-        "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
-        "print(generated_text)"
+      "text/plain": [
+       "New Data Upload               : |          |  0.00B /  0.00B            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "OU-xDHpEEmg9"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0668cd8597f141e89ef38129c6641c1f",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "The final answer is correct!"
+      "text/plain": [
+       "  ...RPO-merged/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            "
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "XNtBOpRY8a2O"
+     "data": {
+      "application/vnd.google.colaboratory.intrinsic+json": {
+       "type": "string"
       },
-      "source": [
-        "## Inference and Serving with vLLM\n",
-        "\n",
-        "You can use Transformer models with **vLLM** to serve them in real-world applications. Learn more [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html)."
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged/commit/b20988444532e79a6915f0b2b6002b5acc2b53e1', commit_message='Upload tokenizer', commit_description='', oid='b20988444532e79a6915f0b2b6002b5acc2b53e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged', endpoint='https://huggingface.co', repo_type='model', repo_id='sergiopaniego/Qwen2-7B-Instruct-GRPO-merged'), pr_revision=None, pr_num=None)"
       ]
-    },
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_merged.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization\n",
+    "tokenizer.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DQ00Ivxi8rFu"
+   },
+   "source": [
+    "### Performing Inference with vLLM\n",
+    "\n",
+    "Use **vLLM** to run your model and generate text efficiently in real time. This allows you to test and deploy your fine-tuned models with low latency and high throughput."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "x7L-HIn4Wbdt",
+    "outputId": "afd66093-3525-4590-f834-c0b373e7bb9e"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "nkhu0uY78lV3"
-      },
-      "source": [
-        "### Push Merged Model (for LoRA or QLoRA Training)\n",
-        "\n",
-        "To serve the model via **vLLM**, the repository must contain the merged model (base model + LoRA adapter). Therefore, you need to upload it first."
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 12-11 15:56:09 [utils.py:253] non-default args: {'dtype': torch.float16, 'max_model_len': 256, 'disable_log_stats': True, 'model_impl': 'transformers', 'model': 'sergiopaniego/Qwen2-7B-Instruct-GRPO-merged'}\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "NF8ZP9Z-Wbdt",
-        "outputId": "32a5ab71-1f0d-4289-ea12-66f5f75a957b"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/plain": [
-              "('Qwen2-7B-Instruct-GRPO-merged/tokenizer_config.json',\n",
-              " 'Qwen2-7B-Instruct-GRPO-merged/special_tokens_map.json',\n",
-              " 'Qwen2-7B-Instruct-GRPO-merged/chat_template.jinja',\n",
-              " 'Qwen2-7B-Instruct-GRPO-merged/vocab.json',\n",
-              " 'Qwen2-7B-Instruct-GRPO-merged/merges.txt',\n",
-              " 'Qwen2-7B-Instruct-GRPO-merged/added_tokens.json',\n",
-              " 'Qwen2-7B-Instruct-GRPO-merged/tokenizer.json')"
-            ]
-          },
-          "execution_count": 29,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "model_merged = fine_tuned_model.merge_and_unload()\n",
-        "\n",
-        "save_dir = f\"{output_dir}-merged\"\n",
-        "\n",
-        "model_merged.save_pretrained(save_dir)\n",
-        "tokenizer.save_pretrained(save_dir)"
-      ]
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:104: UserWarning: \n",
+      "Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.\n",
+      "You are not authenticated with the Hugging Face Hub in this notebook.\n",
+      "If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).\n",
+      "  warnings.warn(\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "d1a0574cc20046d5876cf31b21955f8b",
-            "7cc2f0ef7ad2494cad572cd898095c00",
-            "475420d92bb54dc08517ffe423b015c3",
-            "a76231aeae5a49979d1e9075b0b3eefb",
-            "b4f469f957134ea9b0e28532fe3caaf1",
-            "637e55736da34f2c9b098222ae07244a",
-            "8157e521017c450a9d2a9e41611405e9",
-            "9746ae4ab0574ed186f898dba3b4b197",
-            "d4b2a8805ec548ea85e0900ff5927574",
-            "0668cd8597f141e89ef38129c6641c1f"
-          ]
-        },
-        "id": "X5Zci39rWbdt",
-        "outputId": "ca329f99-dc7b-470c-f5d9-39a3eabcb16d"
-      },
-      "outputs": [
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "d1a0574cc20046d5876cf31b21955f8b",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "Processing Files (0 / 0)      : |          |  0.00B /  0.00B            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "7cc2f0ef7ad2494cad572cd898095c00",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "New Data Upload               : |          |  0.00B /  0.00B            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "475420d92bb54dc08517ffe423b015c3",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "  ...0002-of-00004.safetensors:   0%|          |  612kB / 4.93GB            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "a76231aeae5a49979d1e9075b0b3eefb",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "  ...0003-of-00004.safetensors:   0%|          |  611kB / 4.33GB            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "b4f469f957134ea9b0e28532fe3caaf1",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "  ...0001-of-00004.safetensors:   1%|1         | 50.3MB / 4.88GB            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "637e55736da34f2c9b098222ae07244a",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "  ...0004-of-00004.safetensors:   4%|3         | 41.9MB / 1.09GB            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "8157e521017c450a9d2a9e41611405e9",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "README.md: 0.00B [00:00, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "9746ae4ab0574ed186f898dba3b4b197",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "Processing Files (0 / 0)      : |          |  0.00B /  0.00B            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "d4b2a8805ec548ea85e0900ff5927574",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "New Data Upload               : |          |  0.00B /  0.00B            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "0668cd8597f141e89ef38129c6641c1f",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "  ...RPO-merged/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            "
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.google.colaboratory.intrinsic+json": {
-              "type": "string"
-            },
-            "text/plain": [
-              "CommitInfo(commit_url='https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged/commit/b20988444532e79a6915f0b2b6002b5acc2b53e1', commit_message='Upload tokenizer', commit_description='', oid='b20988444532e79a6915f0b2b6002b5acc2b53e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged', endpoint='https://huggingface.co', repo_type='model', repo_id='sergiopaniego/Qwen2-7B-Instruct-GRPO-merged'), pr_revision=None, pr_num=None)"
-            ]
-          },
-          "execution_count": 30,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "model_merged.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization\n",
-        "tokenizer.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 12-11 15:56:37 [model.py:631] Resolved architecture: TransformersForCausalLM\n",
+      "WARNING 12-11 15:56:37 [model.py:1971] Casting torch.bfloat16 to torch.float16.\n",
+      "INFO 12-11 15:56:37 [model.py:1745] Using max model len 256\n",
+      "INFO 12-11 15:56:40 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.\n",
+      "WARNING 12-11 15:56:43 [system_utils.py:103] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: CUDA is initialized\n",
+      "INFO 12-11 15:57:36 [llm.py:352] Supported tasks: ['generate']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from vllm import LLM, SamplingParams\n",
+    "from transformers import AutoTokenizer\n",
+    "import torch\n",
+    "\n",
+    "llm = LLM(\n",
+    "    model=f\"sergiopaniego/{output_dir}-merged\", # Replace with your HF username or organization\n",
+    "    model_impl=\"transformers\",                  # Select the transformers model implementation\n",
+    "    max_model_len=256,                         # Reduced for efficiency\n",
+    "    dtype=torch.float16\n",
+    ")\n",
+    "hf_tokenizer = AutoTokenizer.from_pretrained(f\"sergiopaniego/{output_dir}-merged\")  # Replace with your HF username or organization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "referenced_widgets": [
+      "f0a4f4fb17bf4a698503212296467547",
+      "5be7348f3f324b5b9397c9ad186fb35d"
+     ]
     },
+    "id": "ZTpSUqxNWbdt",
+    "outputId": "6a9283bf-d3b7-4e54-c775-4502694b5c6d"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "DQ00Ivxi8rFu"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f0a4f4fb17bf4a698503212296467547",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "source": [
-        "### Performing Inference with vLLM\n",
-        "\n",
-        "Use **vLLM** to run your model and generate text efficiently in real-time. This allows you to test and deploy your fine-tuned models with low latency and high throughput."
+      "text/plain": [
+       "Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "x7L-HIn4Wbdt",
-        "outputId": "afd66093-3525-4590-f834-c0b373e7bb9e"
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5be7348f3f324b5b9397c9ad186fb35d",
+       "version_major": 2,
+       "version_minor": 0
       },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "INFO 12-11 15:56:09 [utils.py:253] non-default args: {'dtype': torch.float16, 'max_model_len': 256, 'disable_log_stats': True, 'model_impl': 'transformers', 'model': 'sergiopaniego/Qwen2-7B-Instruct-GRPO-merged'}\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:104: UserWarning: \n",
-            "Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.\n",
-            "You are not authenticated with the Hugging Face Hub in this notebook.\n",
-            "If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).\n",
-            "  warnings.warn(\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "INFO 12-11 15:56:37 [model.py:631] Resolved architecture: TransformersForCausalLM\n",
-            "WARNING 12-11 15:56:37 [model.py:1971] Casting torch.bfloat16 to torch.float16.\n",
-            "INFO 12-11 15:56:37 [model.py:1745] Using max model len 256\n",
-            "INFO 12-11 15:56:40 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.\n",
-            "WARNING 12-11 15:56:43 [system_utils.py:103] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: CUDA is initialized\n",
-            "INFO 12-11 15:57:36 [llm.py:352] Supported tasks: ['generate']\n"
-          ]
-        }
-      ],
-      "source": [
-        "from vllm import LLM, SamplingParams\n",
-        "from transformers import AutoTokenizer\n",
-        "import torch\n",
-        "\n",
-        "llm = LLM(\n",
-        "    model=f\"sergiopaniego/{output_dir}-merged\", # Replace with your HF username or organization\n",
-        "    model_impl=\"transformers\",                  # Select the transformers model implementation\n",
-        "    max_model_len=256,                         # Reduced for efficiency\n",
-        "    dtype=torch.float16\n",
-        ")\n",
-        "hf_tokenizer = AutoTokenizer.from_pretrained(f\"sergiopaniego/{output_dir}-merged\")  # Replace with your HF username or organization"
+      "text/plain": [
+       "Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
       ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "referenced_widgets": [
-            "f0a4f4fb17bf4a698503212296467547",
-            "5be7348f3f324b5b9397c9ad186fb35d"
-          ]
-        },
-        "id": "ZTpSUqxNWbdt",
-        "outputId": "6a9283bf-d3b7-4e54-c775-4502694b5c6d"
-      },
-      "outputs": [
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "f0a4f4fb17bf4a698503212296467547",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "5be7348f3f324b5b9397c9ad186fb35d",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "<think> 1988 birth year implies the person was born either in 1979, 1980, 1981, etc. Looking for the one where sum of digits equals age </think>\n",
-            "\n",
-            "The birth year 1979 gives sum of digits 1+9+7+9 = 26\n",
-            "\n",
-            "The person was 26 years old in 1988.\n",
-            "\n",
-            "Answer: The person was 26 years old.\n"
-          ]
-        }
-      ],
-      "source": [
-        "messages = test_dataset[0]['prompt']\n",
-        "# Alternatively, use llm.chat()\n",
-        "prompt = hf_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n",
-        "\n",
-        "outputs = llm.generate(\n",
-        "    {\"prompt\": prompt},\n",
-        "    sampling_params=SamplingParams(max_tokens=256),\n",
-        ")\n",
-        "\n",
-        "for o in outputs:\n",
-        "    generated_text = o.outputs[0].text\n",
-        "    print(generated_text)"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "gpuType": "T4",
-      "provenance": []
-    },
-    "language_info": {
-      "name": "python"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<think> 1988 birth year implies the person was born either in 1979, 1980, 1981, etc. Looking for the one where sum of digits equals age </think>\n",
+      "\n",
+      "The birth year 1979 gives sum of digits 1+9+7+9 = 26\n",
+      "\n",
+      "The person was 26 years old in 1988.\n",
+      "\n",
+      "Answer: The person was 26 years old.\n"
+     ]
     }
+   ],
+   "source": [
+    "messages = test_dataset[0]['prompt']\n",
+    "# Alternatively, use llm.chat()\n",
+    "prompt = hf_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n",
+    "\n",
+    "outputs = llm.generate(\n",
+    "    {\"prompt\": prompt},\n",
+    "    sampling_params=SamplingParams(max_tokens=256),\n",
+    ")\n",
+    "\n",
+    "for o in outputs:\n",
+    "    generated_text = o.outputs[0].text\n",
+    "    print(generated_text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/examples/notebooks/sft_trl_lora_qlora.ipynb b/examples/notebooks/sft_trl_lora_qlora.ipynb
index f6aeb420f31..4f4933036b4 100644
--- a/examples/notebooks/sft_trl_lora_qlora.ipynb
+++ b/examples/notebooks/sft_trl_lora_qlora.ipynb
@@ -277,7 +277,7 @@
    "source": [
     "## Load model and configure LoRA/QLoRA\n",
     "\n",
-    "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply comment out the `BitsAndBytesConfig` configuration.\n",
+    "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply set `quantization_config = None` below.\n",
     "\n",
     "Below, choose your **preferred model**. All of the options have been tested on **free Colab instances**."
    ]
@@ -317,7 +317,7 @@
     "id": "BXY9Y0_dLWAf"
    },
    "source": [
-    "Let's load the selected model using `transformers`, configuring QLoRA via `bitsandbytes` (you can remove it if doing LoRA). We don't need to configure the tokenizer since the trainer takes care of that automatically."
+    "Let's configure **QLoRA** by defining a `BitsAndBytesConfig` (set `quantization_config = None` for plain LoRA). We pass the model id and this config directly to the trainer, which loads and quantizes the model for us. We don't need to configure the tokenizer since the trainer takes care of that automatically."
    ]
   },
   {
@@ -329,19 +329,14 @@
    "outputs": [],
    "source": [
     "import torch\n",
-    "from transformers import AutoModelForCausalLM, BitsAndBytesConfig\n",
+    "from transformers import BitsAndBytesConfig\n",
     "\n",
-    "model = AutoModelForCausalLM.from_pretrained(\n",
-    "    model_id,\n",
-    "    attn_implementation=\"sdpa\",                   # Change to Flash Attention if GPU has support\n",
-    "    dtype=torch.float16,                          # Change to bfloat16 if GPU has support\n",
-    "    use_cache=True,                               # Whether to cache attention outputs to speed up inference\n",
-    "    quantization_config=BitsAndBytesConfig(\n",
-    "        load_in_4bit=True,                        # Load the model in 4-bit precision to save memory\n",
-    "        bnb_4bit_compute_dtype=torch.float16,     # Data type used for internal computations in quantization\n",
-    "        bnb_4bit_use_double_quant=True,           # Use double quantization to improve accuracy\n",
-    "        bnb_4bit_quant_type=\"nf4\"                 # Type of quantization. \"nf4\" is recommended for recent LLMs\n",
-    "    )\n",
+    "# QLoRA: 4-bit quantization config passed to the trainer (set to None for plain LoRA).\n",
+    "quantization_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,                        # Load the model in 4-bit precision to save memory\n",
+    "    bnb_4bit_compute_dtype=torch.float16,     # Data type used for internal computations in quantization\n",
+    "    bnb_4bit_use_double_quant=True,           # Use double quantization to improve accuracy\n",
+    "    bnb_4bit_quant_type=\"nf4\",                # Type of quantization. \"nf4\" is recommended for recent LLMs\n",
     ")"
    ]
   },
@@ -395,6 +390,13 @@
     "from trl import SFTConfig\n",
     "\n",
     "training_args = SFTConfig(\n",
+    "    # Model loading (passed to `from_pretrained` when the trainer loads the model)\n",
+    "    model_init_kwargs={\n",
+    "        \"attn_implementation\": \"sdpa\",    # Change to Flash Attention if GPU has support\n",
+    "        \"dtype\": \"float16\",               # Change to bfloat16 if GPU has support\n",
+    "        \"use_cache\": True,                # Whether to cache attention key/value states to speed up generation\n",
+    "    },\n",
+    "\n",
     "    # Training schedule / optimization\n",
     "    per_device_train_batch_size = 1,      # Batch size per GPU\n",
     "    gradient_accumulation_steps = 4,      # Gradients are accumulated over multiple steps → effective batch size = 2 * 8 = 16\n",
@@ -441,10 +443,11 @@
     "from trl import SFTTrainer\n",
     "\n",
     "trainer = SFTTrainer(\n",
-    "    model=model,\n",
+    "    model=model_id,\n",
     "    args=training_args,\n",
     "    train_dataset=train_dataset,\n",
-    "    peft_config=peft_config\n",
+    "    quantization_config=quantization_config,\n",
+    "    peft_config=peft_config,\n",
     ")"
    ]
   },
@@ -1055,14 +1058,14 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "id": "0C8MhsSoLWAx",
-    "outputId": "22af8503-64ac-42d5-f134-1d1dc68199e9",
     "colab": {
      "referenced_widgets": [
       "196152bc32a74b9994f55f483ce85dea",
       "a72d3a3407944729b65be313a47d558f"
      ]
-    }
+    },
+    "id": "0C8MhsSoLWAx",
+    "outputId": "22af8503-64ac-42d5-f134-1d1dc68199e9"
    },
    "outputs": [
     {
@@ -1122,18 +1125,18 @@
   }
  ],
  "metadata": {
+  "accelerator": "GPU",
   "colab": {
-   "provenance": [],
-   "gpuType": "T4"
-  },
-  "language_info": {
-   "name": "python"
+   "gpuType": "T4",
+   "provenance": []
   },
   "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3"
+   "display_name": "Python 3",
+   "name": "python3"
   },
-  "accelerator": "GPU"
+  "language_info": {
+   "name": "python"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 0