From cb328f2d4125c6eae5079a1baa96b69a0d972f1b Mon Sep 17 00:00:00 2001 From: KaraKaraWitch <134394640+KaraKaraWitch@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:56:27 +0800 Subject: [PATCH] Constrain-able Abliteration: Move hardcoded layer-range fractions (0.4-0.9) and component weights (0.8-1.5) into a new `[constraints]` section in `Settings`. Enables restricting the search space for specific architectures (e.g., Llama-3.3) where MLP intervention causes knowledge degradation, without modifying source code. Defaults remain unchanged to preserve backward compatibility. Signed-off-by: KaraKaraWitch <134394640+KaraKaraWitch@users.noreply.github.com> --- config.default.toml | 18 ++++++++++++++++++ src/heretic/config.py | 35 +++++++++++++++++++++++++++++++++++ src/heretic/main.py | 13 +++++++++---- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/config.default.toml b/config.default.toml index abfa0fc7..35cf5bb7 100644 --- a/config.default.toml +++ b/config.default.toml @@ -91,6 +91,24 @@ n_trials = 200 # Number of trials that use random sampling for the purpose of exploration. n_startup_trials = 60 +# Constraints for the optimization search space (layers and weights). +[constraints] +# Fraction of layers (from 0.0 to 1.0) where the direction search starts. +layer_start_fraction = 0.4 +# Fraction of layers (from 0.0 to 1.0) where the direction search ends. +layer_end_fraction = 0.9 + +# Search constraints for Attention components (e.g., o_proj). +[constraints.attention] +max_weight_min = 0.8 +max_weight_max = 1.5 + +# Search constraints for MLP components (e.g., down_proj). +[constraints.mlp] +max_weight_min = 0.8 +max_weight_max = 1.5 + + # Directory to save and load study progress to/from.
study_checkpoint_dir = "checkpoints" diff --git a/src/heretic/config.py b/src/heretic/config.py index 8ed3f80c..c7dc0a1f 100644 --- a/src/heretic/config.py +++ b/src/heretic/config.py @@ -61,6 +61,36 @@ class DatasetSpecification(BaseModel): ) +class ComponentConstraints(BaseModel): + max_weight_min: float = Field( + default=0.8, + description="Minimum value for the max_weight parameter search range.", + ) + max_weight_max: float = Field( + default=1.5, + description="Maximum value for the max_weight parameter search range.", + ) + + +class OptimizationConstraints(BaseModel): + layer_start_fraction: float = Field( + default=0.4, + description="Fraction of layers (from 0.0 to 1.0) where the direction search starts.", + ) + layer_end_fraction: float = Field( + default=0.9, + description="Fraction of layers (from 0.0 to 1.0) where the direction search ends.", + ) + attention: ComponentConstraints = Field( + default_factory=ComponentConstraints, + description="Search constraints for Attention components (e.g., o_proj).", + ) + mlp: ComponentConstraints = Field( + default_factory=ComponentConstraints, + description="Search constraints for MLP components (e.g., down_proj).", + ) + + class Settings(BaseSettings): model: str = Field(description="Hugging Face model ID, or path to model on disk.") @@ -225,6 +255,11 @@ class Settings(BaseSettings): description="Number of trials that use random sampling for the purpose of exploration.", ) + constraints: OptimizationConstraints = Field( + default_factory=OptimizationConstraints, + description="Constraints for the optimization search space (layers and weights).", + ) + study_checkpoint_dir: str = Field( default="checkpoints", description="Directory to save and load study progress to/from.", diff --git a/src/heretic/main.py b/src/heretic/main.py index 016c3920..bdb916f0 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -474,8 +474,8 @@ def objective(trial: Trial) -> tuple[float, float]: # work with conditional or 
variable-range parameters. direction_index = trial.suggest_float( "direction_index", - 0.4 * last_layer_index, - 0.9 * last_layer_index, + settings.constraints.layer_start_fraction * last_layer_index, + settings.constraints.layer_end_fraction * last_layer_index, ) if direction_scope == "per layer": @@ -487,10 +487,15 @@ def objective(trial: Trial) -> tuple[float, float]: # The parameter ranges are based on experiments with various models # and much wider ranges. They are not set in stone and might have to be # adjusted for future models. + + if "down_proj" in component: + constraints = settings.constraints.mlp + else: + constraints = settings.constraints.attention max_weight = trial.suggest_float( f"{component}.max_weight", - 0.8, - 1.5, + constraints.max_weight_min, + constraints.max_weight_max, ) max_weight_position = trial.suggest_float( f"{component}.max_weight_position",