From 45d6a2decd6b170775f0fb762b110aa5f8adfab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Wed, 24 Jun 2026 00:12:34 +0000 Subject: [PATCH 1/5] Add `quantization_config` trainer argument (streamline QLoRA) --- docs/source/peft_integration.md | 16 ++++++---------- trl/scripts/dpo.py | 8 ++------ trl/scripts/grpo.py | 8 ++------ trl/scripts/reward.py | 8 ++------ trl/scripts/rloo.py | 8 ++------ trl/scripts/sft.py | 8 ++------ trl/trainer/dpo_trainer.py | 17 +++++++++++++++++ trl/trainer/grpo_trainer.py | 17 +++++++++++++++++ trl/trainer/reward_trainer.py | 18 ++++++++++++++++++ trl/trainer/rloo_trainer.py | 17 +++++++++++++++++ trl/trainer/sft_trainer.py | 17 +++++++++++++++++ 11 files changed, 102 insertions(+), 40 deletions(-) diff --git a/docs/source/peft_integration.md b/docs/source/peft_integration.md index cfc5c180f9b..5eb660e99b8 100644 --- a/docs/source/peft_integration.md +++ b/docs/source/peft_integration.md @@ -449,11 +449,13 @@ python trl/scripts/sft.py \ #### Python Example +Pass the `quantization_config` directly to the trainer alongside `peft_config` — the trainer loads and quantizes the model for you. The same `quantization_config` argument is available on [`SFTTrainer`], [`DPOTrainer`], [`GRPOTrainer`], and [`RLOOTrainer`]. + ```python import torch from peft import LoraConfig -from transformers import AutoModelForCausalLM, BitsAndBytesConfig +from transformers import BitsAndBytesConfig from trl import SFTConfig, SFTTrainer # Configure 4-bit quantization @@ -464,13 +466,6 @@ bnb_config = BitsAndBytesConfig( bnb_4bit_use_double_quant=True, ) -# Load model with quantization -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - quantization_config=bnb_config, - device_map="auto", -) - # Configure LoRA peft_config = LoraConfig( r=32, @@ -486,11 +481,12 @@ training_args = SFTConfig( ... 
) -# Create trainer with PEFT config +# Create trainer with quantization and PEFT config trainer = SFTTrainer( - model=model, + model="meta-llama/Llama-2-7b-hf", args=training_args, train_dataset=dataset, + quantization_config=bnb_config, peft_config=peft_config, ) diff --git a/trl/scripts/dpo.py b/trl/scripts/dpo.py index 5303906dcef..a73391716af 100644 --- a/trl/scripts/dpo.py +++ b/trl/scripts/dpo.py @@ -65,7 +65,7 @@ def main(script_args, training_args, model_args, dataset_args): from accelerate.logging import get_logger from datasets import load_dataset - from trl import DPOTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config + from trl import DPOTrainer, get_dataset, get_peft_config, get_quantization_config logger = get_logger(__name__) @@ -75,11 +75,6 @@ def main(script_args, training_args, model_args, dataset_args): attn_implementation=model_args.attn_implementation, dtype=model_args.dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config # Load the dataset if dataset_args.datasets and script_args.dataset_name: @@ -103,6 +98,7 @@ def main(script_args, training_args, model_args, dataset_args): args=training_args, train_dataset=dataset[script_args.dataset_train_split], eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/trl/scripts/grpo.py b/trl/scripts/grpo.py index fde837780c5..62d2aef142c 100644 --- a/trl/scripts/grpo.py +++ b/trl/scripts/grpo.py @@ -69,7 +69,7 @@ def main(script_args, training_args, model_args, dataset_args): from accelerate.logging import get_logger from datasets import load_dataset - from trl import GRPOTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config + from trl import GRPOTrainer, get_dataset, get_peft_config, get_quantization_config from trl.rewards import ( accuracy_reward, get_soft_overlong_punishment, @@ -113,11 +113,6 @@ def main(script_args, training_args, model_args, dataset_args): attn_implementation=model_args.attn_implementation, dtype=model_args.dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config # Load the dataset if dataset_args.datasets and script_args.dataset_name: @@ -142,6 +137,7 @@ def main(script_args, training_args, model_args, dataset_args): args=training_args, train_dataset=dataset[script_args.dataset_train_split], eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/trl/scripts/reward.py b/trl/scripts/reward.py index 9cec8e9de84..6ee1dff063f 100644 --- a/trl/scripts/reward.py +++ b/trl/scripts/reward.py @@ -28,7 +28,7 @@ def main(script_args, training_args, model_args, dataset_args): from accelerate.logging import get_logger from datasets import load_dataset - from trl import RewardTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config + from trl import RewardTrainer, get_dataset, get_peft_config, get_quantization_config logger = get_logger(__name__) @@ -38,11 +38,6 @@ def main(script_args, training_args, model_args, dataset_args): attn_implementation=model_args.attn_implementation, dtype=model_args.dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config # Load the dataset if dataset_args.datasets and script_args.dataset_name: @@ -66,6 +61,7 @@ def main(script_args, training_args, model_args, dataset_args): args=training_args, train_dataset=dataset[script_args.dataset_train_split], eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/trl/scripts/rloo.py b/trl/scripts/rloo.py index 7ec6f380695..520872e3a76 100644 --- a/trl/scripts/rloo.py +++ b/trl/scripts/rloo.py @@ -69,7 +69,7 @@ def main(script_args, training_args, model_args, dataset_args): from accelerate.logging import get_logger from datasets import load_dataset - from trl import RLOOTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config + from trl import RLOOTrainer, get_dataset, get_peft_config, get_quantization_config from trl.rewards import ( accuracy_reward, get_soft_overlong_punishment, @@ -113,11 +113,6 @@ def main(script_args, training_args, model_args, dataset_args): attn_implementation=model_args.attn_implementation, dtype=model_args.dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config # Load the dataset if dataset_args.datasets and script_args.dataset_name: @@ -142,6 +137,7 @@ def main(script_args, training_args, model_args, dataset_args): args=training_args, train_dataset=dataset[script_args.dataset_train_split], eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/trl/scripts/sft.py b/trl/scripts/sft.py index 82c9622cb0d..760b9683bd6 100644 --- a/trl/scripts/sft.py +++ b/trl/scripts/sft.py @@ -67,7 +67,7 @@ def main(script_args, training_args, model_args, dataset_args): from accelerate.logging import get_logger from datasets import load_dataset - from trl import SFTTrainer, get_dataset, get_kbit_device_map, get_peft_config, get_quantization_config + from trl import SFTTrainer, get_dataset, get_peft_config, get_quantization_config logger = get_logger(__name__) @@ -77,11 +77,6 @@ def main(script_args, training_args, model_args, dataset_args): attn_implementation=model_args.attn_implementation, dtype=model_args.dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config # Load the dataset if dataset_args.datasets and script_args.dataset_name: @@ -105,6 +100,7 @@ def main(script_args, training_args, model_args, dataset_args): args=training_args, train_dataset=dataset[script_args.dataset_train_split], eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py index 2d58d09cc17..58f3ff1f03c 100644 --- a/trl/trainer/dpo_trainer.py +++ b/trl/trainer/dpo_trainer.py @@ -34,6 +34,7 @@ from torch.utils.data import DataLoader from transformers import ( AutoProcessor, + BitsAndBytesConfig, DataCollator, PreTrainedModel, PreTrainedTokenizerBase, @@ -478,6 +479,9 @@ class DPOTrainer(_BaseTrainer): optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`): A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`. + quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*): + Quantization configuration used when loading the model from a model identifier. Combine with `peft_config` + for QLoRA training. Ignored if the model is already instantiated. peft_config ([`~peft.PeftConfig`], *optional*): PEFT configuration used to wrap the model. If `None`, the model is not wrapped. 
""" @@ -511,6 +515,7 @@ def __init__( compute_metrics: Callable[[EvalPrediction], dict] | None = None, callbacks: list[TrainerCallback] | None = None, optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None), + quantization_config: "BitsAndBytesConfig | None" = None, peft_config: "PeftConfig | None" = None, ): # Args @@ -535,6 +540,13 @@ def __init__( # Model if isinstance(model, str): model_init_kwargs = args.model_init_kwargs or {} + if quantization_config is not None: + if "quantization_config" in model_init_kwargs: + raise ValueError( + "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " + "Please set it in only one place." + ) + model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: model_init_kwargs["device_map"] = None @@ -546,6 +558,11 @@ def __init__( "You passed `model_init_kwargs` to the `DPOConfig`, but your model is already instantiated. " "The `model_init_kwargs` will be ignored." ) + if quantization_config is not None: + logger.warning( + "You passed `quantization_config` to the trainer, but your model is already instantiated. The " + "`quantization_config` will be ignored." 
+ ) # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False) if ref_model is model: diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 2bdc363a945..ded3e2e0f86 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -45,6 +45,7 @@ AutoModelForSequenceClassification, AutoProcessor, AutoTokenizer, + BitsAndBytesConfig, GenerationConfig, PreTrainedModel, PreTrainedTokenizerBase, @@ -226,6 +227,9 @@ class GRPOTrainer(_BaseTrainer): optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`): A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`. + quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*): + Quantization configuration used when loading the model from a model identifier. Combine with `peft_config` + for QLoRA training. Ignored if the model is already instantiated. peft_config ([`~peft.PeftConfig`], *optional*): PEFT configuration used to wrap the model. If `None`, the model is not wrapped. 
tools (list of `Callable`, *optional*): @@ -280,6 +284,7 @@ def __init__( reward_processing_classes: PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None = None, callbacks: list[TrainerCallback] | None = None, optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None), + quantization_config: "BitsAndBytesConfig | None" = None, peft_config: "PeftConfig | None" = None, tools: list[Callable] | None = None, rollout_func: RolloutFunc | None = None, @@ -294,6 +299,13 @@ def __init__( # Model if isinstance(model, str): model_init_kwargs = args.model_init_kwargs or {} + if quantization_config is not None: + if "quantization_config" in model_init_kwargs: + raise ValueError( + "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " + "Please set it in only one place." + ) + model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: model_init_kwargs["device_map"] = None @@ -305,6 +317,11 @@ def __init__( "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. " "The `model_init_kwargs` will be ignored." ) + if quantization_config is not None: + logger.warning( + "You passed `quantization_config` to the trainer, but your model is already instantiated. The " + "`quantization_config` will be ignored." 
+ ) # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False) diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py index cfa6254f97c..fba474b1f79 100644 --- a/trl/trainer/reward_trainer.py +++ b/trl/trainer/reward_trainer.py @@ -36,6 +36,7 @@ from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, + BitsAndBytesConfig, DataCollator, PreTrainedModel, PreTrainedTokenizerBase, @@ -309,6 +310,10 @@ class RewardTrainer(_BaseTrainer): by this function will be reflected in the predictions received by `compute_metrics`. Note that the labels (second parameter) will be `None` if the dataset does not have them. + quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*): + Quantization configuration used when loading the model from a model identifier. Combine with `peft_config` + for QLoRA training. Ignored if the model is already instantiated, or if `quantization_config` is also set in + `args.model_init_kwargs`. peft_config ([`~peft.PeftConfig`], *optional*): PEFT configuration used to wrap the model. If `None`, the model is not wrapped. 
Note that if the loaded model is a causal LM, it's highly recommended to set `modules_to_save=["score"]` in the PEFT configuration @@ -332,6 +337,7 @@ def __init__( optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None), optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None, preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None, + quantization_config: "BitsAndBytesConfig | None" = None, peft_config: "PeftConfig | None" = None, ): # Args @@ -359,6 +365,13 @@ def __init__( set_seed(args.seed) if isinstance(model, str): model_init_kwargs = args.model_init_kwargs or {} + if quantization_config is not None: + if "quantization_config" in model_init_kwargs: + raise ValueError( + "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " + "Please set it in only one place." + ) + model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: model_init_kwargs["device_map"] = None @@ -372,6 +385,11 @@ def __init__( "You passed `model_init_kwargs` to the `RewardConfig`, but your model is already instantiated. " "The `model_init_kwargs` will be ignored." ) + if quantization_config is not None: + logger.warning( + "You passed `quantization_config` to the trainer, but your model is already instantiated. The " + "`quantization_config` will be ignored." 
+ ) # Validate that the model has num_labels = 1 (required for reward models) if getattr(model.config, "num_labels", None) != 1: raise ValueError( diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py index 9996d5a7eb0..8e301e93ebd 100644 --- a/trl/trainer/rloo_trainer.py +++ b/trl/trainer/rloo_trainer.py @@ -39,6 +39,7 @@ AutoModelForSequenceClassification, AutoProcessor, AutoTokenizer, + BitsAndBytesConfig, GenerationConfig, PreTrainedModel, PreTrainedTokenizerBase, @@ -196,6 +197,9 @@ class RLOOTrainer(_BaseTrainer): optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`): A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`. + quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*): + Quantization configuration used when loading the model from a model identifier. Combine with `peft_config` + for QLoRA training. Ignored if the model is already instantiated. peft_config ([`~peft.PeftConfig`], *optional*): PEFT configuration used to wrap the model. If `None`, the model is not wrapped. """ @@ -229,6 +233,7 @@ def __init__( reward_processing_classes: PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None = None, callbacks: list[TrainerCallback] | None = None, optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None), + quantization_config: "BitsAndBytesConfig | None" = None, peft_config: "PeftConfig | None" = None, ): # Args @@ -240,6 +245,13 @@ def __init__( # Model if isinstance(model, str): model_init_kwargs = args.model_init_kwargs or {} + if quantization_config is not None: + if "quantization_config" in model_init_kwargs: + raise ValueError( + "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. 
" + "Please set it in only one place." + ) + model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: model_init_kwargs["device_map"] = None @@ -251,6 +263,11 @@ def __init__( "You passed `model_init_kwargs` to the `RLOOConfig`, but your model is already instantiated. " "The `model_init_kwargs` will be ignored." ) + if quantization_config is not None: + logger.warning( + "You passed `quantization_config` to the trainer, but your model is already instantiated. The " + "`quantization_config` will be ignored." + ) # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False) diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py index e6dc55a04dd..7dff94421b4 100644 --- a/trl/trainer/sft_trainer.py +++ b/trl/trainer/sft_trainer.py @@ -34,6 +34,7 @@ from packaging.version import Version from transformers import ( AutoProcessor, + BitsAndBytesConfig, DataCollator, PreTrainedModel, PreTrainedTokenizerBase, @@ -901,6 +902,9 @@ class SFTTrainer(_BaseTrainer): by this function will be reflected in the predictions received by `compute_metrics`. Note that the labels (second parameter) will be `None` if the dataset does not have them. + quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*): + Quantization configuration used when loading the model from a model identifier. Combine with `peft_config` + for QLoRA training. Ignored if the model is already instantiated. peft_config ([`~peft.PeftConfig`], *optional*): PEFT configuration used to wrap the model. If `None`, the model is not wrapped. 
formatting_func (`Callable`, *optional*): @@ -925,6 +929,7 @@ def __init__( optimizers: tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None] = (None, None), optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]] | None = None, preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None, + quantization_config: "BitsAndBytesConfig | None" = None, peft_config: "PeftConfig | None" = None, formatting_func: Callable[[dict], str] | None = None, ): @@ -956,6 +961,13 @@ def __init__( # Model if isinstance(model, str): model_init_kwargs = args.model_init_kwargs or {} + if quantization_config is not None: + if "quantization_config" in model_init_kwargs: + raise ValueError( + "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " + "Please set it in only one place." + ) + model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: model_init_kwargs["device_map"] = None @@ -967,6 +979,11 @@ def __init__( "You passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. " "The `model_init_kwargs` will be ignored." ) + if quantization_config is not None: + logger.warning( + "You passed `quantization_config` to the trainer, but your model is already instantiated. The " + "`quantization_config` will be ignored." 
+ ) # Non-quantized models do not have the `is_loaded_in_{8,4}bit` attributes, whereas quantized models do _is_quantized_model = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False) From 38626f35233b80f7c4c8b1f364217583b3c854fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Wed, 24 Jun 2026 03:38:58 +0000 Subject: [PATCH 2/5] style --- trl/trainer/reward_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py index fba474b1f79..063843a5689 100644 --- a/trl/trainer/reward_trainer.py +++ b/trl/trainer/reward_trainer.py @@ -312,8 +312,8 @@ class RewardTrainer(_BaseTrainer): Note that the labels (second parameter) will be `None` if the dataset does not have them. quantization_config ([`~transformers.BitsAndBytesConfig`], *optional*): Quantization configuration used when loading the model from a model identifier. Combine with `peft_config` - for QLoRA training. Ignored if the model is already instantiated, or if `quantization_config` is also set in - `args.model_init_kwargs`. + for QLoRA training. Ignored if the model is already instantiated, or if `quantization_config` is also set + in `args.model_init_kwargs`. peft_config ([`~peft.PeftConfig`], *optional*): PEFT configuration used to wrap the model. If `None`, the model is not wrapped. 
Note that if the loaded model is a causal LM, it's highly recommended to set `modules_to_save=["score"]` in the PEFT configuration From 0bb426cb4db22d562277e85adcd1f279eaf31146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Wed, 24 Jun 2026 03:42:46 +0000 Subject: [PATCH 3/5] Clarify error message for `quantization_config` to prefer trainer argument --- trl/trainer/dpo_trainer.py | 2 +- trl/trainer/grpo_trainer.py | 2 +- trl/trainer/reward_trainer.py | 2 +- trl/trainer/rloo_trainer.py | 2 +- trl/trainer/sft_trainer.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py index 58f3ff1f03c..500ad964c84 100644 --- a/trl/trainer/dpo_trainer.py +++ b/trl/trainer/dpo_trainer.py @@ -544,7 +544,7 @@ def __init__( if "quantization_config" in model_init_kwargs: raise ValueError( "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " - "Please set it in only one place." + "Please set it in only one place, preferably as a trainer argument." ) model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index ded3e2e0f86..28727fb28da 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -303,7 +303,7 @@ def __init__( if "quantization_config" in model_init_kwargs: raise ValueError( "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " - "Please set it in only one place." + "Please set it in only one place, preferably as a trainer argument." 
) model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py index 063843a5689..9f2c3c667b4 100644 --- a/trl/trainer/reward_trainer.py +++ b/trl/trainer/reward_trainer.py @@ -369,7 +369,7 @@ def __init__( if "quantization_config" in model_init_kwargs: raise ValueError( "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " - "Please set it in only one place." + "Please set it in only one place, preferably as a trainer argument." ) model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py index 8e301e93ebd..40171594d43 100644 --- a/trl/trainer/rloo_trainer.py +++ b/trl/trainer/rloo_trainer.py @@ -249,7 +249,7 @@ def __init__( if "quantization_config" in model_init_kwargs: raise ValueError( "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " - "Please set it in only one place." + "Please set it in only one place, preferably as a trainer argument." ) model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py index 7dff94421b4..27e8cde8b47 100644 --- a/trl/trainer/sft_trainer.py +++ b/trl/trainer/sft_trainer.py @@ -965,7 +965,7 @@ def __init__( if "quantization_config" in model_init_kwargs: raise ValueError( "You set `quantization_config` both as a trainer argument and in `args.model_init_kwargs`. " - "Please set it in only one place." + "Please set it in only one place, preferably as a trainer argument." 
) model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) From f6a660bbadc03b658450df0d0922145af65ecd6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 26 Jun 2026 15:39:33 +0000 Subject: [PATCH 4/5] fix quantization configuration handling in trainers and scripts --- examples/scripts/grpo_vlm.py | 7 +------ examples/scripts/gspo.py | 7 +------ examples/scripts/gspo_vlm.py | 7 +------ examples/scripts/rloo_vlm.py | 7 +------ trl/trainer/dpo_trainer.py | 6 ++++-- trl/trainer/grpo_trainer.py | 6 ++++-- trl/trainer/reward_trainer.py | 2 +- trl/trainer/rloo_trainer.py | 6 ++++-- trl/trainer/sft_trainer.py | 2 +- 9 files changed, 18 insertions(+), 32 deletions(-) diff --git a/examples/scripts/grpo_vlm.py b/examples/scripts/grpo_vlm.py index c748b1b15fc..8e95902dae6 100644 --- a/examples/scripts/grpo_vlm.py +++ b/examples/scripts/grpo_vlm.py @@ -71,7 +71,6 @@ ModelConfig, ScriptArguments, TrlParser, - get_kbit_device_map, get_peft_config, get_quantization_config, ) @@ -90,11 +89,6 @@ attn_implementation=model_args.attn_implementation, dtype=dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config ################ # Dataset @@ -146,6 +140,7 @@ def convert_to_rgb(example): reward_funcs=[think_format_reward, accuracy_reward], train_dataset=train_dataset, eval_dataset=eval_dataset, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/examples/scripts/gspo.py b/examples/scripts/gspo.py index 9f347fd6e3a..a1fb7b67aaf 100644 --- a/examples/scripts/gspo.py +++ b/examples/scripts/gspo.py @@ -60,7 +60,6 @@ ModelConfig, ScriptArguments, TrlParser, - get_kbit_device_map, get_peft_config, get_quantization_config, ) @@ -79,11 +78,6 @@ attn_implementation=model_args.attn_implementation, dtype=dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config ################ # Dataset @@ -120,6 +114,7 @@ def make_conversation(example): reward_funcs=[think_format_reward, accuracy_reward], train_dataset=train_dataset, eval_dataset=eval_dataset, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/examples/scripts/gspo_vlm.py b/examples/scripts/gspo_vlm.py index f96c68a1819..d537c812b70 100644 --- a/examples/scripts/gspo_vlm.py +++ b/examples/scripts/gspo_vlm.py @@ -60,7 +60,6 @@ ModelConfig, ScriptArguments, TrlParser, - get_kbit_device_map, get_peft_config, get_quantization_config, ) @@ -79,11 +78,6 @@ attn_implementation=model_args.attn_implementation, dtype=dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config ################ # Dataset @@ -135,6 +129,7 @@ def convert_to_rgb(example): reward_funcs=[think_format_reward, accuracy_reward], train_dataset=train_dataset, eval_dataset=eval_dataset, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/examples/scripts/rloo_vlm.py b/examples/scripts/rloo_vlm.py index 87bcdc59752..0f77b89210c 100644 --- a/examples/scripts/rloo_vlm.py +++ b/examples/scripts/rloo_vlm.py @@ -71,7 +71,6 @@ RLOOTrainer, ScriptArguments, TrlParser, - get_kbit_device_map, get_peft_config, get_quantization_config, ) @@ -90,11 +89,6 @@ attn_implementation=model_args.attn_implementation, dtype=dtype, ) - quantization_config = get_quantization_config(model_args) - if quantization_config is not None: - # Passing None would not be treated the same as omitting the argument, so we include it only when valid. 
- training_args.model_init_kwargs["device_map"] = get_kbit_device_map() - training_args.model_init_kwargs["quantization_config"] = quantization_config ################ # Dataset @@ -146,6 +140,7 @@ def convert_to_rgb(example): reward_funcs=[think_format_reward, accuracy_reward], train_dataset=train_dataset, eval_dataset=eval_dataset, + quantization_config=get_quantization_config(model_args), peft_config=get_peft_config(model_args), ) diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py index 333787c5a6c..6aa6b83dd48 100644 --- a/trl/trainer/dpo_trainer.py +++ b/trl/trainer/dpo_trainer.py @@ -539,7 +539,7 @@ def __init__( # Model if isinstance(model, str): - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs if quantization_config is not None: if "quantization_config" in model_init_kwargs: raise ValueError( @@ -821,7 +821,9 @@ def __init__( # memory during training. self.ref_model = None else: - ref_model_init_kwargs = args.model_init_kwargs or {} + ref_model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs + if quantization_config is not None: + ref_model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if self.args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: ref_model_init_kwargs["device_map"] = None diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py index 680df67822d..0050da79e67 100644 --- a/trl/trainer/grpo_trainer.py +++ b/trl/trainer/grpo_trainer.py @@ -298,7 +298,7 @@ def __init__( # Model if isinstance(model, str): - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs if quantization_config is not None: if "quantization_config" in model_init_kwargs: raise ValueError( @@ -775,7 +775,9 @@ def 
__init__( self.ref_model = None else: # For deepspeed, fsdp or non-distributed models, create a reference model from scratch - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs + if quantization_config is not None: + model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if self.args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: model_init_kwargs["device_map"] = None diff --git a/trl/trainer/reward_trainer.py b/trl/trainer/reward_trainer.py index 215323b3270..49ff7c0cd43 100644 --- a/trl/trainer/reward_trainer.py +++ b/trl/trainer/reward_trainer.py @@ -364,7 +364,7 @@ def __init__( # be done before loading the model to ensure reproducibility. set_seed(args.seed) if isinstance(model, str): - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs if quantization_config is not None: if "quantization_config" in model_init_kwargs: raise ValueError( diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py index 894406fb8da..2b1142dd16b 100644 --- a/trl/trainer/rloo_trainer.py +++ b/trl/trainer/rloo_trainer.py @@ -244,7 +244,7 @@ def __init__( # Model if isinstance(model, str): - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs if quantization_config is not None: if "quantization_config" in model_init_kwargs: raise ValueError( @@ -547,7 +547,9 @@ def __init__( self.ref_model = None else: # For deepspeed, fsdp or non-distributed models, create a reference model from scratch - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs + if quantization_config is not None: + 
model_init_kwargs["quantization_config"] = quantization_config # Distributed training requires device_map=None ("auto" fails) if self.args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]: model_init_kwargs["device_map"] = None diff --git a/trl/trainer/sft_trainer.py b/trl/trainer/sft_trainer.py index 40b1e4b2b0c..6197d626d66 100644 --- a/trl/trainer/sft_trainer.py +++ b/trl/trainer/sft_trainer.py @@ -960,7 +960,7 @@ def __init__( # Model if isinstance(model, str): - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = dict(args.model_init_kwargs or {}) # copy to avoid mutating model_init_kwargs if quantization_config is not None: if "quantization_config" in model_init_kwargs: raise ValueError( From 7be97c47a0a350ac79ce8166bdbf845fd48c9494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 26 Jun 2026 15:48:39 +0000 Subject: [PATCH 5/5] update notebooks --- examples/notebooks/grpo_trl_lora_qlora.ipynb | 3044 +++++++++--------- examples/notebooks/sft_trl_lora_qlora.ipynb | 57 +- 2 files changed, 1546 insertions(+), 1555 deletions(-) diff --git a/examples/notebooks/grpo_trl_lora_qlora.ipynb b/examples/notebooks/grpo_trl_lora_qlora.ipynb index 80375e2b4e3..fdff2cc44f1 100644 --- a/examples/notebooks/grpo_trl_lora_qlora.ipynb +++ b/examples/notebooks/grpo_trl_lora_qlora.ipynb @@ -1,1638 +1,1626 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "27ozP4Uy-Cz2" - }, - "source": [ - "# Group Relative Policy Optimization (GRPO) with LoRA/QLoRA using TRL — on a Free Colab Notebook\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_trl_lora_qlora.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eOjY4AR1-QnF" - }, - "source": [ - "![trl 
banner](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png)\n", - "\n", - "Easily fine-tune **Large Language Models (LLMs)** or **Vision-Language Models (VLMs)** with **LoRA** or **QLoRA** using the [**Transformers Reinforcement Learning (TRL)**](https://github.com/huggingface/trl) library by Hugging Face and Group Relative Policy Optimization (GRPO) — all within a **free Google Colab notebook** powered by a **T4 GPU**.\n", - "\n", - "Thanks to the **built-in memory and training optimizations in TRL**, including LoRA, quantization, gradient checkpointing, and optimized attention kernels, it is possible to **fine-tune a 7B model on a free T4** with a **~7× reduction in memory consumption** compared to naive FP16 training.\n", - "\n", - "- [TRL GitHub Repository](https://github.com/huggingface/trl) — star us to support the project! \n", - "- [Official TRL Examples](https://huggingface.co/docs/trl/example_overview) \n", - "- [Community Tutorials](https://huggingface.co/docs/trl/community_tutorials)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w2TnJ6ta-2zj" - }, - "source": [ - "## Key concepts\n", - "\n", - "- **GRPO**: A reinforcement learning algorithm that optimizes a policy by comparing multiple generated responses for the same prompt and updating the model based on their relative rewards, without requiring a separate value model.\n", - "- **LoRA**: Updates only a few low-rank parameters, reducing training cost and memory.\n", - "- **QLoRA**: A quantized version of LoRA that enables even larger models to fit on small GPUs.\n", - "- **TRL**: The Hugging Face library that makes fine-tuning and reinforcement learning simple and efficient.\n", - "\n", - "Learn how to perform **GRPO (Group Relative Policy Optimization)** with **LoRA/QLoRA** using **TRL**." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EzScUBxoT4Nt" - }, - "source": [ - "This table demonstrates how **progressively enabling efficiency techniques** affects **memory usage** and **training throughput** across different hardware configurations. \n", - "The techniques range from naive FP16 training to **LoRA, quantization, Liger kernels, paged_adamw_8bit, and gradient checkpointing**.\n", - "\n", - "| Configuration | LoRA | Quant | Liger | Optimizer | Grad. Ckpt | attn_impl | VRAM (T4) GB | VRAM (A100-40GB)| VRAM (A100-80GB) | Tokens/s (T4) | Tokens/s (A100-40GB) | Tokens/s (A100-80GB) | Status (T4) |\n", - "|--------------|------|-------|-------|-----------|------------|-----------|---------------|----------------|---------|---------|---------------|------------------|-------------|\n", - "| **Worst (naive FP16)** | ❌ | ❌ | ❌ | AdamW | ❌ | eager | OOM | OOM | 62 GB | - | - | 0.06 it/s | ❌ |\n", - "| **Best (all optimizations)** | ✅ | ✅ | ✅ | paged_adamw_8bit | ✅ | sdpa | 9.2 GB | 9.6 GB | 9.6 GB | 0.01 it/s | 0.03 it/s | 0.04 it/s | ✅ |\n", - "\n", - "With all efficiency techniques enabled, **memory usage on Colab T4 is reduced by ~7×**, making it possible to **fine-tune a 7B model on free Colab** where naive FP16 training would fail.\n", - "\n", - "> A small trade-off in training speed is observed, but the **VRAM reduction is the key enabler**. For faster training on compatible hardware, **vLLM** can also be leveraged.\n", - "\n", - "> 💡 Note: For a fair comparison, the number of generations and the batch size were not changed." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9RFq6Op7rjc3" - }, - "source": [ - "## Install dependencies\n", - "\n", - "We'll install **TRL** with the **PEFT** extra, which ensures all main dependencies such as **Transformers** and **PEFT** (a package for parameter-efficient fine-tuning, e.g., LoRA/QLoRA) are included. 
Additionally, we'll install **trackio** to log and monitor our experiments, **bitsandbytes** to enable quantization of LLMs, reducing memory consumption for both inference and training, and **liger-kernel** for more efficient training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c2jy45nfWbdo" - }, - "outputs": [], - "source": [ - "!pip install -Uq \"trl[peft]\" bitsandbytes trackio math_verify liger-kernel" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B33zJG_Q_qb3" - }, - "source": [ - "### Log in to Hugging Face\n", - "\n", - "Log in to your **Hugging Face** account to save your fine-tuned model, track your experiment results directly on the Hub or access gated models. You can find your **access token** on your [account settings page](https://huggingface.co/settings/tokens)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "referenced_widgets": [ - "eec717d21e734c4da066763b4a6add7e" - ] - }, - "id": "8zqnTyUDWbdo", - "outputId": "62d71aaf-352b-4736-acb9-189d78654718" - }, - "outputs": [], - "source": [ - "from huggingface_hub import notebook_login\n", - "\n", - "notebook_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cTEw4xlFrhnQ" - }, - "source": [ - "## Load Dataset\n", - "\n", - "In this step, we load the [**AI-MO/NuminaMath-TIR**](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset from the Hugging Face Hub using the `datasets` library.\n", - "This dataset focuses on **mathematical reasoning**, featuring problems that require step-by-step logical solutions.\n", - "By fine-tuning a model that does not yet exhibit strong reasoning capabilities, it can learn to **generate structured reasoning steps**, enhancing both the model's **accuracy** and **interpretability** on math-related tasks.\n", - "\n", - "For efficiency, we'll load only a **small portion of the training split**:" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": { - "id": "zU5icx67Wbdp", - "outputId": "6480b287-dc0e-4e79-feda-f5e4f41d2a82" - }, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "\n", - "dataset_name = 'AI-MO/NuminaMath-TIR'\n", - "train_dataset = load_dataset(dataset_name, split='train[:5%]')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "P1AIokQrBEGw" - }, - "source": [ - "Let's check the structure of the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ff6Gx1TWWbdp", - "outputId": "30d49bed-273a-47d9-d131-a677ca5a8b65" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset({\n", - " features: ['problem', 'solution', 'messages'],\n", - " num_rows: 3622\n", - "})\n" - ] - } - ], - "source": [ - "print(train_dataset)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "27ozP4Uy-Cz2" + }, + "source": [ + "# Group Relative Policy Optimization (GRPO) with LoRA/QLoRA using TRL — on a Free Colab Notebook\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_trl_lora_qlora.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eOjY4AR1-QnF" + }, + "source": [ + "![trl banner](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png)\n", + "\n", + "Easily fine-tune **Large Language Models (LLMs)** or **Vision-Language Models (VLMs)** with **LoRA** or **QLoRA** using the [**Transformers Reinforcement Learning (TRL)**](https://github.com/huggingface/trl) library by Hugging Face and Group Relative Policy Optimization (GRPO) — all within a **free Google Colab notebook** powered by a **T4 GPU**.\n", + "\n", + "Thanks to the **built-in memory and training optimizations in TRL**, including LoRA, quantization, gradient checkpointing, and optimized 
attention kernels, it is possible to **fine-tune a 7B model on a free T4** with a **~7× reduction in memory consumption** compared to naive FP16 training.\n", + "\n", + "- [TRL GitHub Repository](https://github.com/huggingface/trl) — star us to support the project! \n", + "- [Official TRL Examples](https://huggingface.co/docs/trl/example_overview) \n", + "- [Community Tutorials](https://huggingface.co/docs/trl/community_tutorials)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w2TnJ6ta-2zj" + }, + "source": [ + "## Key concepts\n", + "\n", + "- **GRPO**: A reinforcement learning algorithm that optimizes a policy by comparing multiple generated responses for the same prompt and updating the model based on their relative rewards, without requiring a separate value model.\n", + "- **LoRA**: Updates only a few low-rank parameters, reducing training cost and memory.\n", + "- **QLoRA**: A quantized version of LoRA that enables even larger models to fit on small GPUs.\n", + "- **TRL**: The Hugging Face library that makes fine-tuning and reinforcement learning simple and efficient.\n", + "\n", + "Learn how to perform **GRPO (Group Relative Policy Optimization)** with **LoRA/QLoRA** using **TRL**." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EzScUBxoT4Nt" + }, + "source": [ + "This table demonstrates how **progressively enabling efficiency techniques** affects **memory usage** and **training throughput** across different hardware configurations. \n", + "The techniques range from naive FP16 training to **LoRA, quantization, Liger kernels, paged_adamw_8bit, and gradient checkpointing**.\n", + "\n", + "| Configuration | LoRA | Quant | Liger | Optimizer | Grad. 
Ckpt | attn_impl | VRAM (T4) GB | VRAM (A100-40GB)| VRAM (A100-80GB) | Tokens/s (T4) | Tokens/s (A100-40GB) | Tokens/s (A100-80GB) | Status (T4) |\n", + "|--------------|------|-------|-------|-----------|------------|-----------|---------------|----------------|---------|---------|---------------|------------------|-------------|\n", + "| **Worst (naive FP16)** | ❌ | ❌ | ❌ | AdamW | ❌ | eager | OOM | OOM | 62 GB | - | - | 0.06 it/s | ❌ |\n", + "| **Best (all optimizations)** | ✅ | ✅ | ✅ | paged_adamw_8bit | ✅ | sdpa | 9.2 GB | 9.6 GB | 9.6 GB | 0.01 it/s | 0.03 it/s | 0.04 it/s | ✅ |\n", + "\n", + "With all efficiency techniques enabled, **memory usage on Colab T4 is reduced by ~7×**, making it possible to **fine-tune a 7B model on free Colab** where naive FP16 training would fail.\n", + "\n", + "> A small trade-off in training speed is observed, but the **VRAM reduction is the key enabler**. For faster training on compatible hardware, **vLLM** can also be leveraged.\n", + "\n", + "> 💡 Note: For a fair comparison, the number of generations and the batch size were not changed." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9RFq6Op7rjc3" + }, + "source": [ + "## Install dependencies\n", + "\n", + "We'll install **TRL** with the **PEFT** extra, which ensures all main dependencies such as **Transformers** and **PEFT** (a package for parameter-efficient fine-tuning, e.g., LoRA/QLoRA) are included. Additionally, we'll install **trackio** to log and monitor our experiments, **bitsandbytes** to enable quantization of LLMs, reducing memory consumption for both inference and training, and **liger-kernel** for more efficient training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c2jy45nfWbdo" + }, + "outputs": [], + "source": [ + "!pip install -Uq \"trl[peft]\" bitsandbytes trackio math_verify liger-kernel" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B33zJG_Q_qb3" + }, + "source": [ + "### Log in to Hugging Face\n", + "\n", + "Log in to your **Hugging Face** account to save your fine-tuned model, track your experiment results directly on the Hub or access gated models. You can find your **access token** on your [account settings page](https://huggingface.co/settings/tokens)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "eec717d21e734c4da066763b4a6add7e" + ] }, + "id": "8zqnTyUDWbdo", + "outputId": "62d71aaf-352b-4736-acb9-189d78654718" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cTEw4xlFrhnQ" + }, + "source": [ + "## Load Dataset\n", + "\n", + "In this step, we load the [**AI-MO/NuminaMath-TIR**](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset from the Hugging Face Hub using the `datasets` library.\n", + "This dataset focuses on **mathematical reasoning**, featuring problems that require step-by-step logical solutions.\n", + "By fine-tuning a model that does not yet exhibit strong reasoning capabilities, it can learn to **generate structured reasoning steps**, enhancing both the model's **accuracy** and **interpretability** on math-related tasks.\n", + "\n", + "For efficiency, we'll load only a **small portion of the training split**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zU5icx67Wbdp", + "outputId": "6480b287-dc0e-4e79-feda-f5e4f41d2a82" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset_name = 
'AI-MO/NuminaMath-TIR'\n", + "train_dataset = load_dataset(dataset_name, split='train[:5%]')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P1AIokQrBEGw" + }, + "source": [ + "Let's check the structure of the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ff6Gx1TWWbdp", + "outputId": "30d49bed-273a-47d9-d131-a677ca5a8b65" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "QY5hkOqDBGns" - }, - "source": [ - "Let's check one sample:" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset({\n", + " features: ['problem', 'solution', 'messages'],\n", + " num_rows: 3622\n", + "})\n" + ] + } + ], + "source": [ + "print(train_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QY5hkOqDBGns" + }, + "source": [ + "Let's check one sample:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-y9c7i29Wbdp", + "outputId": "760662ea-4db4-4b8e-c234-92ae2c8ecc17" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-y9c7i29Wbdp", - "outputId": "760662ea-4db4-4b8e-c234-92ae2c8ecc17" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'problem': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$? Express your answer as a common fraction.', 'solution': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). 
In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'messages': [{'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$? 
Express your answer as a common fraction.', 'role': 'user'}, {'content': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). 
To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'role': 'assistant'}]}\n" - ] - } - ], - "source": [ - "print(train_dataset[0])" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "{'problem': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$? Express your answer as a common fraction.', 'solution': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. 
Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'messages': [{'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$? Express your answer as a common fraction.', 'role': 'user'}, {'content': \"To determine the coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\), we can use the binomial theorem.\\n\\nThe binomial theorem states:\\n\\\\[\\n(a + b)^n = \\\\sum_{k=0}^{n} \\\\binom{n}{k} a^{n-k} b^k\\n\\\\]\\n\\nIn this case, \\\\(a = \\\\frac{3}{5}x\\\\), \\\\(b = -\\\\frac{y}{2}\\\\), and \\\\(n = 8\\\\).\\n\\nWe are interested in the term that contains \\\\(x^2y^6\\\\). 
In the general term of the binomial expansion:\\n\\\\[\\n\\\\binom{8}{k} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-k} \\\\left(-\\\\frac{y}{2}\\\\right)^k\\n\\\\]\\n\\nTo get \\\\(x^2\\\\), we need \\\\(8 - k = 2\\\\), thus \\\\(k = 6\\\\).\\n\\nSubstituting \\\\(k = 6\\\\) into the expression:\\n\\\\[\\n\\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^{8-6} \\\\left(-\\\\frac{y}{2}\\\\right)^6 = \\\\binom{8}{6} \\\\left(\\\\frac{3}{5}x\\\\right)^2 \\\\left(-\\\\frac{y}{2}\\\\right)^6\\n\\\\]\\n\\nNow, we will compute each part of this expression.\\n\\n1. Calculate the binomial coefficient \\\\(\\\\binom{8}{6}\\\\).\\n2. Compute \\\\(\\\\left(\\\\frac{3}{5}\\\\right)^2\\\\).\\n3. Compute \\\\(\\\\left(-\\\\frac{y}{2}\\\\right)^6\\\\).\\n4. Combine everything together to get the coefficient of \\\\(x^2y^6\\\\).\\n\\nLet's compute these in Python.\\n```python\\nfrom math import comb\\n\\n# Given values\\nn = 8\\nk = 6\\n\\n# Calculate the binomial coefficient\\nbinom_coeff = comb(n, k)\\n\\n# Compute (3/5)^2\\na_term = (3/5)**2\\n\\n# Compute (-1/2)^6\\nb_term = (-1/2)**6\\n\\n# Combine terms to get the coefficient of x^2y^6\\ncoefficient = binom_coeff * a_term * b_term\\nprint(coefficient)\\n```\\n```output\\n0.1575\\n```\\nThe coefficient of \\\\(x^2y^6\\\\) in the expansion of \\\\(\\\\left(\\\\frac{3}{5}x - \\\\frac{y}{2}\\\\right)^8\\\\) is \\\\(0.1575\\\\). To express this as a common fraction, we recognize that:\\n\\n\\\\[ 0.1575 = \\\\frac{1575}{10000} = \\\\frac{63}{400} \\\\]\\n\\nThus, the coefficient can be expressed as:\\n\\n\\\\[\\n\\\\boxed{\\\\frac{63}{400}}\\n\\\\]\", 'role': 'assistant'}]}\n" + ] + } + ], + "source": [ + "print(train_dataset[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DiqBlxK_A0SD" + }, + "source": [ + "We will adapt our dataset to a conversational format using a custom system prompt, guiding the LLM to generate both step-by-step reasoning and the final answer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RWxK5xFKWbdp" + }, + "outputs": [], + "source": [ + "SYSTEM_PROMPT = (\n", + " \"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant \"\n", + " \"first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning \"\n", + " \"process is enclosed strictly within and tags. \"\n", + " \"After closing , the assistant MUST provide the final answer in plain text.\"\n", + ")\n", + "\n", + "\n", + "def make_conversation(example):\n", + " return {\n", + " \"prompt\": [\n", + " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": example[\"problem\"]},\n", + " ],\n", + " }\n", + "\n", + "train_dataset = train_dataset.map(make_conversation)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sND566XAC0kD" + }, + "source": [ + "Let's take a look at an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q-kHUmpMWbdp", + "outputId": "452beb3a-1091-46d4-997e-04b91562d66c" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "DiqBlxK_A0SD" - }, - "source": [ - "We will adapt our dataset to a conversational format using a custom system prompt, guiding the LLM to generate both step-by-step reasoning and the final answer." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process is enclosed strictly within and tags. 
After closing , the assistant MUST provide the final answer in plain text.', 'role': 'system'}, {'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$? Express your answer as a common fraction.', 'role': 'user'}]\n" + ] + } + ], + "source": [ + "print(train_dataset[0]['prompt'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bw0qcp-CC3G0" + }, + "source": [ + "We'll remove the `messages` and `problem` columns, as we only need the custom `prompt` column and `solution` to verify the generated answer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SzbF3hdRWbdp", + "outputId": "bd59a383-1d4e-4020-c232-79ce66073fd1" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RWxK5xFKWbdp" - }, - "outputs": [], - "source": [ - "SYSTEM_PROMPT = (\n", - " \"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant \"\n", - " \"first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning \"\n", - " \"process is enclosed strictly within and tags. 
\"\n", - " \"After closing , the assistant MUST provide the final answer in plain text.\"\n", - ")\n", - "\n", - "\n", - "def make_conversation(example):\n", - " return {\n", - " \"prompt\": [\n", - " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n", - " {\"role\": \"user\", \"content\": example[\"problem\"]},\n", - " ],\n", - " }\n", - "\n", - "train_dataset = train_dataset.map(make_conversation)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset({\n", + " features: ['solution', 'prompt'],\n", + " num_rows: 3622\n", + "})\n" + ] + } + ], + "source": [ + "train_dataset = train_dataset.remove_columns(['messages', 'problem'])\n", + "print(train_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvs5rjQBr7af" + }, + "source": [ + "## Load model and configure LoRA/QLoRA\n", + "\n", + "Below, choose your **preferred model**. All of the options have been tested on **free Colab instances**.\n", + "\n", + "> 💡 Note: Some models, such as Qwen2.5 and Qwen3, are known to have been pretrained on data that improves their math performance. Be cautious when selecting the appropriate model for training to ensure meaningful fine-tuning results ([source](https://thinkingmachines.ai/blog/lora/))." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7_uaW3JfWbdp" + }, + "outputs": [], + "source": [ + "# Select one model below by uncommenting the line you want to use 👇\n", + "## Qwen\n", + "model_id, output_dir = \"Qwen/Qwen2-7B-Instruct\", \"t4-Qwen2-7B-Instruct-GRPO\" # ✅ ~9.2GB VRAM\n", + "# model_id, output_dir = \"unsloth/qwen3-14b-unsloth-bnb-4bit\", \"qwen3-14b-unsloth-bnb-4bit-GRPO\" # ⚠️ OOM with this config; fits if GRPO params are reduced\n", + "# model_id, output_dir = \"Qwen/Qwen3-8B\", \"Qwen3-8B-GRPO\" # ✅ ~9.9GB VRAM\n", + "# model_id, output_dir = \"Qwen/Qwen2.5-7B-Instruct\", \"Qwen2.5-7B-Instruct-GRPO\" # ✅ ~9.2GB VRAM\n", + "\n", + "## Llama\n", + "# model_id, output_dir = \"meta-llama/Llama-3.2-3B-Instruct\", \"Llama-3.2-3B-Instruct-GRPO\" # ✅ ~5.7GB VRAM\n", + "# model_id, output_dir = \"meta-llama/Llama-3.1-8B-Instruct\", \"Llama-3.1-8B-Instruct-GRPO\" # ✅ ~9.5GB VRAM\n", + "\n", + "## LFM2.5\n", + "# model_id, output_dir = \"LiquidAI/LFM2.5-1.2B-Instruct\", \"LFM2.5-1.2B-Instruct-GRPO\" # ✅ ~1.12 GB VRAM" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aw__94OWDnER" + }, + "source": [ + "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply set `quantization_config = None` below (training without quantization consumes more memory).\n", + "\n", + "Let's configure **QLoRA** by defining a `BitsAndBytesConfig`. We pass the model id and this config directly to the trainer, which loads and quantizes the model for us. We don't need to configure the tokenizer since the trainer takes care of that automatically." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "1130e5a744864ca5b5873731e4764983" + ] }, + "id": "o86TnTchWbdp", + "outputId": "77a7e6c8-0360-40f1-eea7-b941be031366" + }, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import BitsAndBytesConfig\n", + "\n", + "# QLoRA: 4-bit quantization config passed to the trainer (set to None for plain LoRA).\n", + "quantization_config = BitsAndBytesConfig(\n", + " load_in_4bit=True, # Load the model in 4-bit precision to save memory\n", + " bnb_4bit_compute_dtype=torch.float16, # Data type used for internal computations in quantization\n", + " bnb_4bit_use_double_quant=True, # Nested quantization: also quantize the quantization constants to save extra memory\n", + " bnb_4bit_quant_type=\"nf4\", # Type of quantization. \"nf4\" is recommended for recent LLMs\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AM-G0_QmDyZC" + }, + "source": [ + "The following cell defines LoRA (or QLoRA if needed). When training with LoRA/QLoRA, we use a **base model** (the one selected above) and, instead of modifying its original weights, we fine-tune a **LoRA adapter**, a lightweight layer that enables efficient and memory-friendly training. The **`target_modules`** specify which parts of the model (e.g., attention or projection layers) will be adapted by LoRA during fine-tuning." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIz2pmX6Wbdp" + }, + "outputs": [], + "source": [ + "from peft import LoraConfig\n", + "\n", + "# You may need to update `target_modules` depending on the architecture of your chosen model.\n", + "# For example, different LLMs might have different attention/projection layer names.\n", + "peft_config = LoraConfig(\n", + " r=32,\n", + " lora_alpha=32,\n", + " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\",],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "prKnAp-Esyiq" + }, + "source": [ + "## Train model\n", + "\n", + "GRPO requires **reward functions** to guide the learning process. For convenience, we can directly load pre-defined rewards from `trl.rewards`, which already includes a [collection of ready-to-use rewards](https://huggingface.co/docs/trl/rewards).\n", + "\n", + "If you want to create your own custom reward functions to teach the model, a reward function is simply a Python function that takes the generated completions and returns a list of floats. For example, the following function, which we use in this notebook, rewards completions that correctly follow the `` format:\n", + "\n", + "```python\n", + "def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:\n", + " pattern = r\"^(?!.*)(.*?).*$\"\n", + " completion_contents = [completion[0][\"content\"] for completion in completions]\n", + " matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents]\n", + " return [1.0 if match else 0.0 for match in matches]\n", + "```\n", + "\n", + "In this notebook, we will use both `think_format_reward`, which rewards completions that correctly follow the `` format, and `reasoning_accuracy_reward`, which evaluates the correctness of the model's solution to the mathematical problem. 
Together, these rewards guide the model to generate **structured reasoning** while producing **accurate answers**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lj42Qs5vWbdp" + }, + "outputs": [], + "source": [ + "from trl.rewards import think_format_reward, reasoning_accuracy_reward" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bFgYgxMbtbEZ" + }, + "source": [ + "We'll configure **GRPO** using `GRPOConfig`, keeping the parameters minimal so that the training can run on a free Colab instance. You can adjust these settings if you have access to more resources. For a complete list of available parameters and their descriptions, refer to the [TRL GRPOConfig documentation](https://huggingface.co/docs/trl/grpo_trainer#trl.GRPOConfig).\n", + "\n", + "> 💡 Note: TRL supports using **vLLM** for generation during GRPO training, which can significantly speed up training. However, it increases VRAM usage since a separate vLLM process is active to handle generation. In this notebook, we do not enable vLLM because we are using **QLoRA**, which updates the quantized vLLM model weights at every step. Enabling vLLM in this setup can cause weight precision issues and make convergence more challenging. The configuration includes the vLLM parameters in case you want to experiment with it. Learn more about vLLM integration in TRL [here](https://huggingface.co/docs/trl/main/en/vllm_integration)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JY11EQMhWbdp" + }, + "outputs": [], + "source": [ + "from trl import GRPOConfig\n", + "\n", + "# Configure training arguments using GRPOConfig\n", + "training_args = GRPOConfig(\n", + " # Model loading (passed to `from_pretrained` when the trainer loads the model)\n", + " model_init_kwargs={\n", + " \"attn_implementation\": \"sdpa\", # Change to Flash Attention if GPU has support\n", + " \"dtype\": \"float32\", # Change to bfloat16 if GPU has support\n", + " },\n", + "\n", + " # Training schedule / optimization\n", + " learning_rate=2e-5, # Learning rate for the optimizer\n", + " #num_train_epochs=1,\n", + " max_steps=500, # Total number of training steps (overrides `num_train_epochs`). For full trainings, use `num_train_epochs` instead\n", + "\n", + " # Parameters that control GRPO training (you can adapt them)\n", + " per_device_train_batch_size = 8,\n", + " max_completion_length=256, # default: 256 # Max completion length produced during training\n", + " num_generations=8, # default: 8 # Number of generations produced during training for comparison\n", + "\n", + " # Optimizations\n", + " optim = \"paged_adamw_8bit\", # Optimizer\n", + " use_liger_kernel=True, # Enable Liger kernel optimizations for faster training\n", + "\n", + " # Parameters related to reporting and saving\n", + " output_dir=output_dir, # Where to save model checkpoints and logs\n", + " logging_steps=10, # Log training metrics every N steps\n", + " report_to=\"trackio\", # Experiment tracking tool\n", + " trackio_space_id=output_dir, # HF Space where the experiment tracking will be saved\n", + " log_completions=False, # Return model completions during training\n", + "\n", + " # Hub integration\n", + " push_to_hub=True, # Automatically push the trained model to the Hugging Face Hub\n", + " # The model will be saved under your Hub account in the repository named `output_dir`\n", + " # vLLM params\n", + " #use_vllm=False, # Activate vLLM training 
for faster training\n", + " #vllm_mode='colocate',\n", + " #vllm_gpu_memory_utilization=0.1,\n", + " #vllm_enable_sleep_mode=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-9LlOAvWFSor" + }, + "source": [ + "Configure the `GRPOTrainer` by passing the previously defined `training_args`. To keep memory usage low, we are not using an evaluation dataset, but you can include one if desired. We also provide the reward functions that were imported earlier to guide the training process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iI_E9KCUWbdq" + }, + "outputs": [], + "source": [ + "from trl import GRPOTrainer\n", + "\n", + "trainer = GRPOTrainer(\n", + " model=model_id,\n", + " reward_funcs=[think_format_reward, reasoning_accuracy_reward],\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " quantization_config=quantization_config,\n", + " peft_config=peft_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8dY7bK8FGLhh" + }, + "source": [ + "Show memory stats before training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PEVRGlrAWbdq", + "outputId": "78fac9e4-4ae6-4836-bd10-c30b39059782" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "sND566XAC0kD" - }, - "source": [ - "Let's take a look at an example:" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "GPU = Tesla T4. Max memory = 14.741 GB.\n", + "6.773 GB of memory reserved.\n" + ] + } + ], + "source": [ + "gpu_stats = torch.cuda.get_device_properties(0)\n", + "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", + "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n", + "\n", + "print(f\"GPU = {gpu_stats.name}. 
Max memory = {max_memory} GB.\")\n", + "print(f\"{start_gpu_memory} GB of memory reserved.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z-5xPtfIGQL5" + }, + "source": [ + "And train!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Training on a T4 in Colab with the configuration defined in this notebook takes around 13 hours. If you're just experimenting, you can try the following quicker task ([source](https://huggingface.co/learn/llm-course/en/chapter12/5)):\n", + "\n", + "```python\n", + "dataset = load_dataset(\"mlabonne/smoltldr\")\n", + "\n", + "# Reward function\n", + "ideal_length = 50\n", + "\n", + "def reward_len(completions, **kwargs):\n", + " return [-abs(ideal_length - len(completion)) for completion in completions]\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zl7-PmoXWbdq", + "outputId": "f39c8c3c-43c2-4f2d-c98d-4c595ae1129f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Q-kHUmpMWbdp", - "outputId": "452beb3a-1091-46d4-997e-04b91562d66c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'content': 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process is enclosed strictly within and tags. After closing , the assistant MUST provide the final answer in plain text.', 'role': 'system'}, {'content': 'What is the coefficient of $x^2y^6$ in the expansion of $\\\\left(\\\\frac{3}{5}x-\\\\frac{y}{2}\\\\right)^8$? 
Express your answer as a common fraction.', 'role': 'user'}]\n" - ] - } - ], - "source": [ - "print(train_dataset[0]['prompt'])" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "bw0qcp-CC3G0" - }, - "source": [ - "We'll remove the `messages` and `problem` columns, as we only need the custom `prompt` column and `solution` to verify the generated answer." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "* Trackio project initialized: huggingface\n", + "* Trackio metrics will be synced to Hugging Face Dataset: sergiopaniego/t4-Qwen2-7B-Instruct-GRPO-dataset\n", + "* Creating new space: https://huggingface.co/spaces/sergiopaniego/t4-Qwen2-7B-Instruct-GRPO\n", + "* View dashboard by going to: https://sergiopaniego-t4-Qwen2-7B-Instruct-GRPO.hf.space/\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SzbF3hdRWbdp", - "outputId": "bd59a383-1d4e-4020-c232-79ce66073fd1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset({\n", - " features: ['solution', 'prompt'],\n", - " num_rows: 3622\n", - "})\n" - ] - } + "data": { + "text/html": [ + "
" ], - "source": [ - "train_dataset = train_dataset.remove_columns(['messages', 'problem'])\n", - "print(train_dataset)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvs5rjQBr7af" - }, - "source": [ - "## Load model and configure LoRA/QLoRA\n", - "\n", - "Below, choose your **preferred model**. All of the options have been tested on **free Colab instances**.\n", - "\n", - "> 💡 Note: Some models, such as Qwen2.5 and Qwen3, are known to have been pretrained on data that improves their math performance. Be cautious when selecting the appropriate model for training to ensure meaningful fine-tuning results ([source](https://thinkingmachines.ai/blog/lora/))." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7_uaW3JfWbdp" - }, - "outputs": [], - "source": [ - "# Select one model below by uncommenting the line you want to use 👇\n", - "## Qwen\n", - "model_id, output_dir = \"Qwen/Qwen2-7B-Instruct\", \"t4-Qwen2-7B-Instruct-GRPO\" # ✅ ~9.2GB VRAM\n", - "# model_id, output_dir = \"unsloth/qwen3-14b-unsloth-bnb-4bit\", \"qwen3-14b-unsloth-bnb-4bit-GRPO\" # ⚠️ OOM with this config; fits if GRPO params are reduced\n", - "# model_id, output_dir = \"Qwen/Qwen3-8B\", \"Qwen3-8B-GRPO\" # ✅ ~9.9GB VRAM\n", - "# model_id, output_dir = \"Qwen/Qwen2.5-7B-Instruct\", \"Qwen2.5-7B-Instruct-GRPO\" # ✅ ~9.2GB VRAM\n", - "\n", - "## Llama\n", - "# model_id, output_dir = \"meta-llama/Llama-3.2-3B-Instruct\", \"Llama-3.2-3B-Instruct-GRPO\" # ✅ ~5.7GB VRAM\n", - "# model_id, output_dir = \"meta-llama/Llama-3.1-8B-Instruct\", \"Llama-3.1-8B-Instruct-GRPO\" # ✅ ~9.5GB VRAM\n", - "\n", - "## LFM2.5\n", - "# model_id, output_dir = \"LiquidAI/LFM2.5-1.2B-Instruct\", \"LFM2.5-1.2B-Instruct-GRPO\" # ✅ ~1.12 GB VRAM" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "aw__94OWDnER" - }, - "source": [ - "This notebook can be used with two fine-tuning methods. 
By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply comment out the `BitsAndBytesConfig` configuration (training without quantization consumes more memory).\n", - "\n", - "Let's load the selected model using `transformers`, configuring QLoRA via `bitsandbytes` (you can remove it if doing LoRA). We don't need to configure the tokenizer since the trainer takes care of that automatically." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "* Created new run: sergiopaniego-1766143600\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "referenced_widgets": [ - "1130e5a744864ca5b5873731e4764983" - ] - }, - "id": "o86TnTchWbdp", - "outputId": "77a7e6c8-0360-40f1-eea7-b941be031366" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1130e5a744864ca5b5873731e4764983", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Loading checkpoint shards: 0%| | 0/4 [00:00\n", + " \n", + " \n", + " [500/500 13:05:04, Epoch 0/1]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
100.027900
20-0.011600
300.021500
400.033400
500.039400
600.010300
700.048200
800.067300
900.030600
1000.064000
1100.021500
1200.021400
1300.000000
140-0.028500
150-0.003100
1600.017300
170-0.024700
1800.003300
1900.000000
200-0.001400
2100.008000
2200.034300
2300.044600
2400.016400
250-0.015200
2600.016800
2700.042900
2800.031300
2900.006200
3000.043300
3100.029700
3200.001100
3300.027000
340-0.006700
3500.027200
3600.008200
370-0.015800
3800.007200
3900.012100
4000.000000
4100.010500
4200.019800
4300.000800
4400.003400
450-0.007900
460-0.011800
470-0.016300
480-0.002300
490-0.005500
5000.038000

" ], - "source": [ - "import torch\n", - "from transformers import AutoModelForCausalLM, BitsAndBytesConfig\n", - "\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " model_id,\n", - " attn_implementation=\"sdpa\", # Change to Flash Attention if GPU has support\n", - " dtype=\"float32\", # Change to bfloat16 if GPU has support\n", - " quantization_config=BitsAndBytesConfig(\n", - " load_in_4bit=True, # Load the model in 4-bit precision to save memory\n", - " bnb_4bit_compute_dtype=torch.float16, # Data type used for internal computations in quantization\n", - " bnb_4bit_use_double_quant=True, # Use double quantization to improve accuracy\n", - " bnb_4bit_quant_type=\"nf4\" # Type of quantization. \"nf4\" is recommended for recent LLMs\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AM-G0_QmDyZC" - }, - "source": [ - "The following cell defines LoRA (or QLoRA if needed). When training with LoRA/QLoRA, we use a **base model** (the one selected above) and, instead of modifying its original weights, we fine-tune a **LoRA adapter**, a lightweight layer that enables efficient and memory-friendly training. The **`target_modules`** specify which parts of the model (e.g., attention or projection layers) will be adapted by LoRA during fine-tuning." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WIz2pmX6Wbdp" - }, - "outputs": [], - "source": [ - "from peft import LoraConfig\n", - "\n", - "# You may need to update `target_modules` depending on the architecture of your chosen model.\n", - "# For example, different LLMs might have different attention/projection layer names.\n", - "peft_config = LoraConfig(\n", - " r=32,\n", - " lora_alpha=32,\n", - " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\",],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "prKnAp-Esyiq" - }, - "source": [ - "## Train model\n", - "\n", - "GRPO requires **reward functions** to guide the learning process. For convenience, we can directly load pre-defined rewards from `trl.rewards`, which already includes a [collection of ready-to-use rewards](https://huggingface.co/docs/trl/rewards).\n", - "\n", - "If you want to create your own custom reward functions to teach the model, a reward function is simply a Python function that takes the generated completions and returns a list of floats. For example, the following function, which we use in this notebook, rewards completions that correctly follow the `` format:\n", - "\n", - "```python\n", - "def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:\n", - " pattern = r\"^(?!.*)(.*?).*$\"\n", - " completion_contents = [completion[0][\"content\"] for completion in completions]\n", - " matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents]\n", - " return [1.0 if match else 0.0 for match in matches]\n", - "```\n", - "\n", - "In this notebook, we will use both `think_format_reward`, which rewards completions that correctly follow the `` format, and `reasoning_accuracy_reward`, which evaluates the correctness of the model's solution to the mathematical problem. 
Together, these rewards guide the model to generate **structured reasoning** while producing **accurate answers**." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lj42Qs5vWbdp" - }, - "outputs": [], - "source": [ - "from trl.rewards import think_format_reward, reasoning_accuracy_reward" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bFgYgxMbtbEZ" - }, - "source": [ - "We'll configure **GRPO** using `GRPOConfig`, keeping the parameters minimal so that the training can run on a free Colab instance. You can adjust these settings if you have access to more resources. For a complete list of available parameters and their descriptions, refer to the [TRL GRPOConfig documentation](https://huggingface.co/docs/trl/grpo_trainer#trl.GRPOConfig).\n", - "\n", - "> 💡 Note: TRL supports using **vLLM** for generation during GRPO training, which can significantly speed up training. However, it increases VRAM usage since a separate vLLM process is active to handle generation. In this notebook, we do not enable vLLM because we are using **QLoRA**, which updates the quantized vLLM model weights at every step. Enabling vLLM in this setup can cause weight precision issues and make convergence more challenging. The configuration includes the vLLM parameters in case you want to experiment with it. Learn more about vLLM integration in TRL [here](https://huggingface.co/docs/trl/main/en/vllm_integration)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JY11EQMhWbdp" - }, - "outputs": [], - "source": [ - "from trl import GRPOConfig\n", - "\n", - "# Configure training arguments using GRPOConfig\n", - "training_args = GRPOConfig(\n", - " # Training schedule / optimization\n", - " learning_rate=2e-5, # Learning rate for the optimizer\n", - " #num_train_epochs=1,\n", - " max_steps=500, # Number of dataset passes. 
For full trainings, use `num_train_epochs` instead\n", - "\n", - " # Parameters that control GRPO training (you can adapt them)\n", - " per_device_train_batch_size = 8,\n", - " max_completion_length=256, # default: 256 # Max completion length produced during training\n", - " num_generations=8, # default: 8 # Number of generations produced during trainig for comparison\n", - "\n", - " # Optimizations\n", - " optim = \"paged_adamw_8bit\", # Optimizer\n", - " use_liger_kernel=True, # Enable Liger kernel optimizations for faster training\n", - "\n", - " # Parameters related to reporting and saving\n", - " output_dir=output_dir, # Where to save model checkpoints and logs\n", - " logging_steps=10, # Log training metrics every N steps\n", - " report_to=\"trackio\", # Experiment tracking tool\n", - " trackio_space_id=output_dir, # HF Space where the experiment tracking will be saved\n", - " log_completions=False, # Return model completions during training\n", - "\n", - " # Hub integration\n", - " push_to_hub=True, # Automatically push the trained model to the Hugging Face Hub\n", - " # The model will be saved under your Hub account in the repository named `output_dir`\n", - " # vLLM params\n", - " #use_vllm=False, # Activate vLLM training for faster training\n", - " #vllm_mode='colocate',\n", - " #vllm_gpu_memory_utilization=0.1,\n", - " #vllm_enable_sleep_mode=True\n", - ")" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "-9LlOAvWFSor" - }, - "source": [ - "Configure the `GRPOTrainer` by passing the previously defined `training_args`. To keep memory usage low, we are not using an evaluation dataset, but you can include one if desired. We also provide the reward functions that were imported earlier to guide the training process." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iI_E9KCUWbdq" - }, - "outputs": [], - "source": [ - "from trl import GRPOTrainer\n", - "\n", - "trainer = GRPOTrainer(\n", - " model=model,\n", - " reward_funcs=[think_format_reward, reasoning_accuracy_reward],\n", - " args=training_args,\n", - " train_dataset=train_dataset,\n", - " peft_config=peft_config,\n", - ")" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "* Run finished. Uploading logs to Trackio (please wait...)\n" + ] + } + ], + "source": [ + "trainer_stats = trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iqAN-XLCGTGW" + }, + "source": [ + "Show memory stats after training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4BeEwp5EWbds", + "outputId": "668b8a2c-2eef-4e34-8d4a-2a43ccbbdc00" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "8dY7bK8FGLhh" - }, - "source": [ - "Show memory stats before training" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "47228.679 seconds used for training.\n", + "787.14 minutes used for training.\n", + "Peak reserved memory = 8.832 GB.\n", + "Peak reserved memory for training = 2.059 GB.\n", + "Peak reserved memory % of max memory = 59.915 %.\n", + "Peak reserved memory for training % of max memory = 13.968 %.\n" + ] + } + ], + "source": [ + "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", + "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n", + "used_percentage = round(used_memory / max_memory * 100, 3)\n", + "lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n", + "\n", + "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n", + "print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n", + "print(f\"Peak reserved memory = {used_memory} GB.\")\n", + "print(f\"Peak 
reserved memory for training = {used_memory_for_lora} GB.\")\n", + "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n", + "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R8Sd_AqILeYi" + }, + "source": [ + "The training procedure generates both standard training logs and **trackio** logs, which help us monitor the training progress. Example outputs would look like the following:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2bPn6gruLf-n" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ibO4f7tuLboQ" + }, + "source": [ + "## Saving fine tuned model\n", + "\n", + "In this step, we save the fine-tuned model both **locally** and to the **Hugging Face Hub** using the credentials from your account." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "e6a3677667ce47bcba55e3e950e446f9", + "17adb84604d84cf688a89a21f6cc6150", + "a21c1bbd3cd04738a8c96fbfc0c016c6", + "65cadde3da7642188f029bb2aceaa7c6", + "0404b89e5ce24e76958c72bedc1a95cc", + "c52baf990fde40c0873747e827dc6926", + "191653e8ce184123a68f26fbf2b78745", + "0bb882d400864b249c80132264de2623", + "09cbfcf6e51c431798f4e392a81be6d3", + "d6521f73f23f42e18ee462a547f251a1" + ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PEVRGlrAWbdq", - "outputId": "78fac9e4-4ae6-4836-bd10-c30b39059782" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GPU = Tesla T4. Max memory = 14.741 GB.\n", - "6.773 GB of memory reserved.\n" - ] - } - ], - "source": [ - "gpu_stats = torch.cuda.get_device_properties(0)\n", - "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", - "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n", - "\n", - "print(f\"GPU = {gpu_stats.name}. 
Max memory = {max_memory} GB.\")\n", - "print(f\"{start_gpu_memory} GB of memory reserved.\")" - ] + "id": "itpVDjy0Wbdt", + "outputId": "b821c7ed-6c9d-440a-a797-e25291627bef" + }, + "outputs": [], + "source": [ + "trainer.save_model(output_dir)\n", + "trainer.push_to_hub(dataset_name=dataset_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "81eBZe-X7daz" + }, + "source": [ + "## Load the fine-tuned model and run inference\n", + "\n", + "Now, let's test our fine-tuned model by loading the **LoRA/QLoRA adapter** and performing **inference**. We'll start by loading the **base model**, then attach the adapter to it, creating the final fine-tuned model ready for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "1d3fbf86d53845beac599c5b231e87ea" + ] }, + "id": "ZLdaWYzNWbdt", + "outputId": "a103b64b-1f6b-4423-c5fd-402f210e6dc3" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "z-5xPtfIGQL5" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1d3fbf86d53845beac599c5b231e87ea", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "And train!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Training on a T4 in Colab with the configuration defined in this notebook takes around 13 hours. 
If you're just experimenting, you can try the following quicker task ([source](https://huggingface.co/learn/llm-course/en/chapter12/5)):\n", - "\n", - "```python\n", - "dataset = load_dataset(\"mlabonne/smoltldr\")\n", - "\n", - "# Reward function\n", - "ideal_length = 50\n", - "\n", - "def reward_len(completions, **kwargs):\n", - " return [-abs(ideal_length - len(completion)) for completion in completions]\n", - "```" + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/4 [00:00" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "* Created new run: sergiopaniego-1766143600\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "

\n", - " \n", - " \n", - " [500/500 13:05:04, Epoch 0/1]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss
100.027900
20-0.011600
300.021500
400.033400
500.039400
600.010300
700.048200
800.067300
900.030600
1000.064000
1100.021500
1200.021400
1300.000000
140-0.028500
150-0.003100
1600.017300
170-0.024700
1800.003300
1900.000000
200-0.001400
2100.008000
2200.034300
2300.044600
2400.016400
250-0.015200
2600.016800
2700.042900
2800.031300
2900.006200
3000.043300
3100.029700
3200.001100
3300.027000
340-0.006700
3500.027200
3600.008200
370-0.015800
3800.007200
3900.012100
4000.000000
4100.010500
4200.019800
4300.000800
4400.003400
450-0.007900
460-0.011800
470-0.016300
480-0.002300
490-0.005500
5000.038000

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "* Run finished. Uploading logs to Trackio (please wait...)\n" - ] - } - ], - "source": [ - "trainer_stats = trainer.train()" + "text/plain": [ + "Map: 0%| | 0/1 [00:00 and tags. After closing , the assistant MUST provide the final answer in plain text.',\n", + " 'role': 'system'},\n", + " {'content': \"In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?\",\n", + " 'role': 'user'}]" ] - }, + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset_name = 'AI-MO/NuminaMath-TIR'\n", + "test_dataset = load_dataset(dataset_name, split='test[:1%]')\n", + "test_dataset = test_dataset.map(make_conversation)\n", + "test_dataset = test_dataset.remove_columns(['messages', 'problem'])\n", + "test_dataset[0]['prompt']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CxKyZwG28BYJ" + }, + "source": [ + "Let's first check what's the output for the base model, without the adapter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qTPJY96eWbdt", + "outputId": "ed02acca-e856-44ec-fa20-c32efd81e018" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4BeEwp5EWbds", - "outputId": "668b8a2c-2eef-4e34-8d4a-2a43ccbbdc00" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "47228.679 seconds used for training.\n", - "787.14 minutes used for training.\n", - "Peak reserved memory = 8.832 GB.\n", - "Peak reserved memory for training = 2.059 GB.\n", - "Peak reserved memory % of max memory = 59.915 %.\n", - "Peak reserved memory for training % of max memory = 13.968 %.\n" - ] - } - ], - "source": [ - "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", - "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n", - "used_percentage = round(used_memory / max_memory * 100, 3)\n", - "lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n", - "\n", - "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n", - "print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n", - "print(f\"Peak reserved memory = {used_memory} GB.\")\n", - "print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n", - "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n", - "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "To solve this problem, let's denote the birth year of the person as \\(Y\\) (where \\(Y\\) is a four-digit number) and their age in 1988 as \\(A\\). According to the given condition, their age in 1988 is equal to the sum of the digits of their birth year. \n", + "\n", + "Since we're looking at the year 1988, the person would be \\(1988 - Y\\) years old in that year. 
Given the condition:\n", + "\n", + "\\[1988 - Y = \\text{sum of the digits of } Y\\]\n", + "\n", + "Let's break down the possible range for \\(Y\\). Since the person's age must be less than or equal to 100 (as the sum of the digits of any four-digit number cannot exceed 36), \\(Y\\) must be between 1989 and 2088.\n", + "\n", + "We can systematically check each year in this range to find when the condition holds true. However, considering the constraint on age, we can narrow our search significantly. For example, if \\(Y\\) were 1990, the sum of its digits would be 18, which is not a reasonable age. We need\n" + ] + } + ], + "source": [ + "messages = test_dataset[0]['prompt']\n", + "text = tokenizer.apply_chat_template(\n", + " messages, add_generation_prompt=True, tokenize=False\n", + ")\n", + "model_inputs = tokenizer([text], return_tensors=\"pt\").to(base_model.device)\n", + "\n", + "generated_ids = base_model.generate(\n", + " **model_inputs,\n", + " max_new_tokens=256\n", + ")\n", + "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n", + "\n", + "# Decode and extract model response\n", + "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n", + "print(generated_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V9eoUwQS8SIi" + }, + "source": [ + "The base model neither produced reasoning traces nor provided a correct answer. Let's now load the fine-tuned model and check its performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "073b351afd264bf0bf23043b37e0d8ce", + "3dee429faf4e40b192cabebfe4bf2245" + ] }, + "id": "CNannsXXWbdt", + "outputId": "fc43a5b9-4ec6-43eb-fc34-f26e92434faf" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "R8Sd_AqILeYi" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "073b351afd264bf0bf23043b37e0d8ce", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "The training procedure generates both standard training logs and **trackio** logs, which help us monitor the training progress. Example outputs would look like the following:" + "text/plain": [ + "adapter_config.json: 0.00B [00:00, ?B/s]" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "2bPn6gruLf-n" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3dee429faf4e40b192cabebfe4bf2245", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "" + "text/plain": [ + "adapter_model.safetensors: 0%| | 0.00/162M [00:00 I need to find a birth year where the sum of its digits equals the person's age in 1988 \n", + "\n", + "The person would have been born in 1979, since 1+9+7+9 = 26 and 26 is the age in 1988\n", + "\n", + "answer: 26\n" + ] + } + ], + "source": [ + "text = tokenizer.apply_chat_template(\n", + " messages, add_generation_prompt=True, tokenize=False\n", + ")\n", + "model_inputs = tokenizer([text], return_tensors=\"pt\").to(fine_tuned_model.device)\n", + "\n", + "generated_ids = fine_tuned_model.generate(\n", + " **model_inputs,\n", + " max_new_tokens=256\n", + ")\n", + "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n", + "\n", + "# Decode and extract model response\n", + "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n", + "print(generated_text)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "id": "OU-xDHpEEmg9" + }, + "source": [ + "The final answer is correct!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XNtBOpRY8a2O" + }, + "source": [ + "## Inference and Serving with vLLM\n", + "\n", + "You can use Transformer models with **vLLM** to serve them in real-world applications. Learn more [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nkhu0uY78lV3" + }, + "source": [ + "### Push Merged Model (for LoRA or QLoRA Training)\n", + "\n", + "To serve the model via **vLLM**, the repository must contain the merged model (base model + LoRA adapter). Therefore, you need to upload it first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NF8ZP9Z-Wbdt", + "outputId": "32a5ab71-1f0d-4289-ea12-66f5f75a957b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "referenced_widgets": [ - "e6a3677667ce47bcba55e3e950e446f9", - "17adb84604d84cf688a89a21f6cc6150", - "a21c1bbd3cd04738a8c96fbfc0c016c6", - "65cadde3da7642188f029bb2aceaa7c6", - "0404b89e5ce24e76958c72bedc1a95cc", - "c52baf990fde40c0873747e827dc6926", - "191653e8ce184123a68f26fbf2b78745", - "0bb882d400864b249c80132264de2623", - "09cbfcf6e51c431798f4e392a81be6d3", - "d6521f73f23f42e18ee462a547f251a1" - ] - }, - "id": "itpVDjy0Wbdt", - "outputId": "b821c7ed-6c9d-440a-a797-e25291627bef" - }, - "outputs": [], - "source": [ - "trainer.save_model(output_dir)\n", - "trainer.push_to_hub(dataset_name=dataset_name)" + "data": { + "text/plain": [ + "('Qwen2-7B-Instruct-GRPO-merged/tokenizer_config.json',\n", + " 'Qwen2-7B-Instruct-GRPO-merged/special_tokens_map.json',\n", + " 'Qwen2-7B-Instruct-GRPO-merged/chat_template.jinja',\n", + " 'Qwen2-7B-Instruct-GRPO-merged/vocab.json',\n", + " 'Qwen2-7B-Instruct-GRPO-merged/merges.txt',\n", + " 'Qwen2-7B-Instruct-GRPO-merged/added_tokens.json',\n", + " 
'Qwen2-7B-Instruct-GRPO-merged/tokenizer.json')" ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_merged = fine_tuned_model.merge_and_unload()\n", + "\n", + "save_dir = f\"{output_dir}-merged\"\n", + "\n", + "model_merged.save_pretrained(save_dir)\n", + "tokenizer.save_pretrained(save_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "d1a0574cc20046d5876cf31b21955f8b", + "7cc2f0ef7ad2494cad572cd898095c00", + "475420d92bb54dc08517ffe423b015c3", + "a76231aeae5a49979d1e9075b0b3eefb", + "b4f469f957134ea9b0e28532fe3caaf1", + "637e55736da34f2c9b098222ae07244a", + "8157e521017c450a9d2a9e41611405e9", + "9746ae4ab0574ed186f898dba3b4b197", + "d4b2a8805ec548ea85e0900ff5927574", + "0668cd8597f141e89ef38129c6641c1f" + ] }, + "id": "X5Zci39rWbdt", + "outputId": "ca329f99-dc7b-470c-f5d9-39a3eabcb16d" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "81eBZe-X7daz" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d1a0574cc20046d5876cf31b21955f8b", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Load the fine-tuned model and run inference\n", - "\n", - "Now, let's test our fine-tuned model by loading the **LoRA/QLoRA adapter** and performing **inference**. We'll start by loading the **base model**, then attach the adapter to it, creating the final fine-tuned model ready for evaluation." 
+ "text/plain": [ + "Processing Files (0 / 0) : | | 0.00B / 0.00B " ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "referenced_widgets": [ - "1d3fbf86d53845beac599c5b231e87ea" - ] - }, - "id": "ZLdaWYzNWbdt", - "outputId": "a103b64b-1f6b-4423-c5fd-402f210e6dc3" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7cc2f0ef7ad2494cad572cd898095c00", + "version_major": 2, + "version_minor": 0 }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1d3fbf86d53845beac599c5b231e87ea", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Loading checkpoint shards: 0%| | 0/4 [00:00 and tags. After closing , the assistant MUST provide the final answer in plain text.',\n", - " 'role': 'system'},\n", - " {'content': \"In 1988, a person's age was equal to the sum of the digits of their birth year. How old was this person?\",\n", - " 'role': 'user'}]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from datasets import load_dataset\n", - "\n", - "dataset_name = 'AI-MO/NuminaMath-TIR'\n", - "test_dataset = load_dataset(dataset_name, split='test[:1%]')\n", - "test_dataset = test_dataset.map(make_conversation)\n", - "test_dataset = test_dataset.remove_columns(['messages', 'problem'])\n", - "test_dataset[0]['prompt']" + "text/plain": [ + " ...0003-of-00004.safetensors: 0%| | 611kB / 4.33GB " ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "CxKyZwG28BYJ" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b4f469f957134ea9b0e28532fe3caaf1", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "Let's first check what's the output for the base model, without the adapter." 
+ "text/plain": [ + " ...0001-of-00004.safetensors: 1%|1 | 50.3MB / 4.88GB " ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qTPJY96eWbdt", - "outputId": "ed02acca-e856-44ec-fa20-c32efd81e018" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "637e55736da34f2c9b098222ae07244a", + "version_major": 2, + "version_minor": 0 }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "To solve this problem, let's denote the birth year of the person as \\(Y\\) (where \\(Y\\) is a four-digit number) and their age in 1988 as \\(A\\). According to the given condition, their age in 1988 is equal to the sum of the digits of their birth year. \n", - "\n", - "Since we're looking at the year 1988, the person would be \\(1988 - Y\\) years old in that year. Given the condition:\n", - "\n", - "\\[1988 - Y = \\text{sum of the digits of } Y\\]\n", - "\n", - "Let's break down the possible range for \\(Y\\). Since the person's age must be less than or equal to 100 (as the sum of the digits of any four-digit number cannot exceed 36), \\(Y\\) must be between 1989 and 2088.\n", - "\n", - "We can systematically check each year in this range to find when the condition holds true. However, considering the constraint on age, we can narrow our search significantly. For example, if \\(Y\\) were 1990, the sum of its digits would be 18, which is not a reasonable age. 
We need\n" - ] - } - ], - "source": [ - "messages = test_dataset[0]['prompt']\n", - "text = tokenizer.apply_chat_template(\n", - " messages, add_generation_prompt=True, tokenize=False\n", - ")\n", - "model_inputs = tokenizer([text], return_tensors=\"pt\").to(base_model.device)\n", - "\n", - "generated_ids = base_model.generate(\n", - " **model_inputs,\n", - " max_new_tokens=256\n", - ")\n", - "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n", - "\n", - "# Decode and extract model response\n", - "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n", - "print(generated_text)" + "text/plain": [ + " ...0004-of-00004.safetensors: 4%|3 | 41.9MB / 1.09GB " ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "V9eoUwQS8SIi" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8157e521017c450a9d2a9e41611405e9", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "The base model neither produced reasoning traces nor provided a correct answer. Let's now load the fine-tuned model and check its performance." 
+ "text/plain": [ + "README.md: 0.00B [00:00, ?B/s]" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "referenced_widgets": [ - "073b351afd264bf0bf23043b37e0d8ce", - "3dee429faf4e40b192cabebfe4bf2245" - ] - }, - "id": "CNannsXXWbdt", - "outputId": "fc43a5b9-4ec6-43eb-fc34-f26e92434faf" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9746ae4ab0574ed186f898dba3b4b197", + "version_major": 2, + "version_minor": 0 }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "073b351afd264bf0bf23043b37e0d8ce", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "adapter_config.json: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3dee429faf4e40b192cabebfe4bf2245", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "adapter_model.safetensors: 0%| | 0.00/162M [00:00 I need to find a birth year where the sum of its digits equals the person's age in 1988 \n", - "\n", - "The person would have been born in 1979, since 1+9+7+9 = 26 and 26 is the age in 1988\n", - "\n", - "answer: 26\n" - ] - } - ], - "source": [ - "text = tokenizer.apply_chat_template(\n", - " messages, add_generation_prompt=True, tokenize=False\n", - ")\n", - "model_inputs = tokenizer([text], return_tensors=\"pt\").to(fine_tuned_model.device)\n", - "\n", - "generated_ids = fine_tuned_model.generate(\n", - " **model_inputs,\n", - " max_new_tokens=256\n", - ")\n", - "output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n", - "\n", - "# Decode and extract model response\n", - "generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n", - "print(generated_text)" + "text/plain": [ + "New Data Upload : | | 0.00B / 0.00B " ] + }, + "metadata": {}, + "output_type": "display_data" }, { - 
"cell_type": "markdown", - "metadata": { - "id": "OU-xDHpEEmg9" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0668cd8597f141e89ef38129c6641c1f", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "The final answer is correct!" + "text/plain": [ + " ...RPO-merged/tokenizer.json: 100%|##########| 11.4MB / 11.4MB " ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "XNtBOpRY8a2O" + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" }, - "source": [ - "## Inference and Serving with vLLM\n", - "\n", - "You can use Transformer models with **vLLM** to serve them in real-world applications. Learn more [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html)." + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged/commit/b20988444532e79a6915f0b2b6002b5acc2b53e1', commit_message='Upload tokenizer', commit_description='', oid='b20988444532e79a6915f0b2b6002b5acc2b53e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged', endpoint='https://huggingface.co', repo_type='model', repo_id='sergiopaniego/Qwen2-7B-Instruct-GRPO-merged'), pr_revision=None, pr_num=None)" ] - }, + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_merged.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization\n", + "tokenizer.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DQ00Ivxi8rFu" + }, + "source": [ + "### Performing Inference with vLLM\n", + "\n", + "Use **vLLM** to run your model and generate text efficiently in real-time. This allows you to test and deploy your fine-tuned models with low latency and high throughput." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x7L-HIn4Wbdt", + "outputId": "afd66093-3525-4590-f834-c0b373e7bb9e" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "nkhu0uY78lV3" - }, - "source": [ - "### Push Merged Model (for LoRA or QLoRA Training)\n", - "\n", - "To serve the model via **vLLM**, the repository must contain the merged model (base model + LoRA adapter). Therefore, you need to upload it first." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 12-11 15:56:09 [utils.py:253] non-default args: {'dtype': torch.float16, 'max_model_len': 256, 'disable_log_stats': True, 'model_impl': 'transformers', 'model': 'sergiopaniego/Qwen2-7B-Instruct-GRPO-merged'}\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NF8ZP9Z-Wbdt", - "outputId": "32a5ab71-1f0d-4289-ea12-66f5f75a957b" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "('Qwen2-7B-Instruct-GRPO-merged/tokenizer_config.json',\n", - " 'Qwen2-7B-Instruct-GRPO-merged/special_tokens_map.json',\n", - " 'Qwen2-7B-Instruct-GRPO-merged/chat_template.jinja',\n", - " 'Qwen2-7B-Instruct-GRPO-merged/vocab.json',\n", - " 'Qwen2-7B-Instruct-GRPO-merged/merges.txt',\n", - " 'Qwen2-7B-Instruct-GRPO-merged/added_tokens.json',\n", - " 'Qwen2-7B-Instruct-GRPO-merged/tokenizer.json')" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_merged = fine_tuned_model.merge_and_unload()\n", - "\n", - "save_dir = f\"{output_dir}-merged\"\n", - "\n", - "model_merged.save_pretrained(save_dir)\n", - "tokenizer.save_pretrained(save_dir)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:104: UserWarning: \n", + "Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. 
Secrets can only be fetched when running from the Colab UI.'.\n", + "You are not authenticated with the Hugging Face Hub in this notebook.\n", + "If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).\n", + " warnings.warn(\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "referenced_widgets": [ - "d1a0574cc20046d5876cf31b21955f8b", - "7cc2f0ef7ad2494cad572cd898095c00", - "475420d92bb54dc08517ffe423b015c3", - "a76231aeae5a49979d1e9075b0b3eefb", - "b4f469f957134ea9b0e28532fe3caaf1", - "637e55736da34f2c9b098222ae07244a", - "8157e521017c450a9d2a9e41611405e9", - "9746ae4ab0574ed186f898dba3b4b197", - "d4b2a8805ec548ea85e0900ff5927574", - "0668cd8597f141e89ef38129c6641c1f" - ] - }, - "id": "X5Zci39rWbdt", - "outputId": "ca329f99-dc7b-470c-f5d9-39a3eabcb16d" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d1a0574cc20046d5876cf31b21955f8b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing Files (0 / 0) : | | 0.00B / 0.00B " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7cc2f0ef7ad2494cad572cd898095c00", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "New Data Upload : | | 0.00B / 0.00B " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "475420d92bb54dc08517ffe423b015c3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " ...0002-of-00004.safetensors: 0%| | 612kB / 4.93GB " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a76231aeae5a49979d1e9075b0b3eefb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " ...0003-of-00004.safetensors: 
0%| | 611kB / 4.33GB " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b4f469f957134ea9b0e28532fe3caaf1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " ...0001-of-00004.safetensors: 1%|1 | 50.3MB / 4.88GB " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "637e55736da34f2c9b098222ae07244a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " ...0004-of-00004.safetensors: 4%|3 | 41.9MB / 1.09GB " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8157e521017c450a9d2a9e41611405e9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "README.md: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9746ae4ab0574ed186f898dba3b4b197", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing Files (0 / 0) : | | 0.00B / 0.00B " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d4b2a8805ec548ea85e0900ff5927574", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "New Data Upload : | | 0.00B / 0.00B " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0668cd8597f141e89ef38129c6641c1f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " ...RPO-merged/tokenizer.json: 100%|##########| 11.4MB / 11.4MB " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - 
"CommitInfo(commit_url='https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged/commit/b20988444532e79a6915f0b2b6002b5acc2b53e1', commit_message='Upload tokenizer', commit_description='', oid='b20988444532e79a6915f0b2b6002b5acc2b53e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sergiopaniego/Qwen2-7B-Instruct-GRPO-merged', endpoint='https://huggingface.co', repo_type='model', repo_id='sergiopaniego/Qwen2-7B-Instruct-GRPO-merged'), pr_revision=None, pr_num=None)" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_merged.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization\n", - "tokenizer.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 12-11 15:56:37 [model.py:631] Resolved architecture: TransformersForCausalLM\n", + "WARNING 12-11 15:56:37 [model.py:1971] Casting torch.bfloat16 to torch.float16.\n", + "INFO 12-11 15:56:37 [model.py:1745] Using max model len 256\n", + "INFO 12-11 15:56:40 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.\n", + "WARNING 12-11 15:56:43 [system_utils.py:103] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. 
Reasons: CUDA is initialized\n", + "INFO 12-11 15:57:36 [llm.py:352] Supported tasks: ['generate']\n" + ] + } + ], + "source": [ + "from vllm import LLM, SamplingParams\n", + "from transformers import AutoTokenizer\n", + "import torch\n", + "\n", + "llm = LLM(\n", + " model=f\"sergiopaniego/{output_dir}-merged\", # Replace with your HF username or organization\n", + " model_impl=\"transformers\", # Select the transformers model implementation\n", + " max_model_len=256, # Reduced for efficiency\n", + " dtype=torch.float16\n", + ")\n", + "hf_tokenizer = AutoTokenizer.from_pretrained(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "f0a4f4fb17bf4a698503212296467547", + "5be7348f3f324b5b9397c9ad186fb35d" + ] }, + "id": "ZTpSUqxNWbdt", + "outputId": "6a9283bf-d3b7-4e54-c775-4502694b5c6d" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "DQ00Ivxi8rFu" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f0a4f4fb17bf4a698503212296467547", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "### Performing Inference with vLLM\n", - "\n", - "Use **vLLM** to run your model and generate text efficiently in real-time. This allows you to test and deploy your fine-tuned models with low latency and high throughput." + "text/plain": [ + "Adding requests: 0%| | 0/1 [00:00 1988 birth year implies the person was born either in 1979, 1980, 1981, etc. 
Looking for the one where sum of digits equals age \n", - "\n", - "The birth year 1979 gives sum of digits 1+9+7+9 = 26\n", - "\n", - "The person was 26 years old in 1988.\n", - "\n", - "Answer: The person was 26 years old.\n" - ] - } - ], - "source": [ - "messages = test_dataset[0]['prompt']\n", - "# Alternatively, use llm.chat()\n", - "prompt = hf_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n", - "\n", - "outputs = llm.generate(\n", - " {\"prompt\": prompt},\n", - " sampling_params=SamplingParams(max_tokens=256),\n", - ")\n", - "\n", - "for o in outputs:\n", - " generated_text = o.outputs[0].text\n", - " print(generated_text)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "language_info": { - "name": "python" + "name": "stdout", + "output_type": "stream", + "text": [ + " 1988 birth year implies the person was born either in 1979, 1980, 1981, etc. Looking for the one where sum of digits equals age \n", + "\n", + "The birth year 1979 gives sum of digits 1+9+7+9 = 26\n", + "\n", + "The person was 26 years old in 1988.\n", + "\n", + "Answer: The person was 26 years old.\n" + ] } + ], + "source": [ + "messages = test_dataset[0]['prompt']\n", + "# Alternatively, use llm.chat()\n", + "prompt = hf_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n", + "\n", + "outputs = llm.generate(\n", + " {\"prompt\": prompt},\n", + " sampling_params=SamplingParams(max_tokens=256),\n", + ")\n", + "\n", + "for o in outputs:\n", + " generated_text = o.outputs[0].text\n", + " print(generated_text)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/notebooks/sft_trl_lora_qlora.ipynb b/examples/notebooks/sft_trl_lora_qlora.ipynb index 
f6aeb420f31..4f4933036b4 100644 --- a/examples/notebooks/sft_trl_lora_qlora.ipynb +++ b/examples/notebooks/sft_trl_lora_qlora.ipynb @@ -277,7 +277,7 @@ "source": [ "## Load model and configure LoRA/QLoRA\n", "\n", - "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply comment out the `BitsAndBytesConfig` configuration.\n", + "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply set `quantization_config = None` below.\n", "\n", "Below, choose your **preferred model**. All of the options have been tested on **free Colab instances**." ] @@ -317,7 +317,7 @@ "id": "BXY9Y0_dLWAf" }, "source": [ - "Let's load the selected model using `transformers`, configuring QLoRA via `bitsandbytes` (you can remove it if doing LoRA). We don't need to configure the tokenizer since the trainer takes care of that automatically." + "Let's configure **QLoRA** by defining a `BitsAndBytesConfig` (set `quantization_config = None` for plain LoRA). We pass the model id and this config directly to the trainer, which loads and quantizes the model for us. We don't need to configure the tokenizer since the trainer takes care of that automatically." 
] }, { @@ -329,19 +329,14 @@ "outputs": [], "source": [ "import torch\n", - "from transformers import AutoModelForCausalLM, BitsAndBytesConfig\n", + "from transformers import BitsAndBytesConfig\n", "\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " model_id,\n", - " attn_implementation=\"sdpa\", # Change to Flash Attention if GPU has support\n", - " dtype=torch.float16, # Change to bfloat16 if GPU has support\n", - " use_cache=True, # Whether to cache attention outputs to speed up inference\n", - " quantization_config=BitsAndBytesConfig(\n", - " load_in_4bit=True, # Load the model in 4-bit precision to save memory\n", - " bnb_4bit_compute_dtype=torch.float16, # Data type used for internal computations in quantization\n", - " bnb_4bit_use_double_quant=True, # Use double quantization to improve accuracy\n", - " bnb_4bit_quant_type=\"nf4\" # Type of quantization. \"nf4\" is recommended for recent LLMs\n", - " )\n", + "# QLoRA: 4-bit quantization config passed to the trainer (set to None for plain LoRA).\n", + "quantization_config = BitsAndBytesConfig(\n", + " load_in_4bit=True, # Load the model in 4-bit precision to save memory\n", + " bnb_4bit_compute_dtype=torch.float16, # Data type used for internal computations in quantization\n", + " bnb_4bit_use_double_quant=True, # Use double quantization to improve accuracy\n", + " bnb_4bit_quant_type=\"nf4\", # Type of quantization. 
\"nf4\" is recommended for recent LLMs\n", ")" ] }, @@ -395,6 +390,13 @@ "from trl import SFTConfig\n", "\n", "training_args = SFTConfig(\n", + " # Model loading (passed to `from_pretrained` when the trainer loads the model)\n", + " model_init_kwargs={\n", + " \"attn_implementation\": \"sdpa\", # Change to Flash Attention if GPU has support\n", + " \"dtype\": \"float16\", # Change to bfloat16 if GPU has support\n", + " \"use_cache\": True, # Whether to cache attention outputs to speed up inference\n", + " },\n", + "\n", " # Training schedule / optimization\n", " per_device_train_batch_size = 1, # Batch size per GPU\n", " gradient_accumulation_steps = 4, # Gradients are accumulated over multiple steps → effective batch size = 2 * 8 = 16\n", @@ -441,10 +443,11 @@ "from trl import SFTTrainer\n", "\n", "trainer = SFTTrainer(\n", - " model=model,\n", + " model=model_id,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", - " peft_config=peft_config\n", + " quantization_config=quantization_config,\n", + " peft_config=peft_config,\n", ")" ] }, @@ -1055,14 +1058,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "0C8MhsSoLWAx", - "outputId": "22af8503-64ac-42d5-f134-1d1dc68199e9", "colab": { "referenced_widgets": [ "196152bc32a74b9994f55f483ce85dea", "a72d3a3407944729b65be313a47d558f" ] - } + }, + "id": "0C8MhsSoLWAx", + "outputId": "22af8503-64ac-42d5-f134-1d1dc68199e9" }, "outputs": [ { @@ -1122,18 +1125,18 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { - "provenance": [], - "gpuType": "T4" - }, - "language_info": { - "name": "python" + "gpuType": "T4", + "provenance": [] }, "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "display_name": "Python 3", + "name": "python3" }, - "accelerator": "GPU" + "language_info": { + "name": "python" + } }, "nbformat": 4, "nbformat_minor": 0