p-e-w · p-e-w · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/config.nohumor.toml b/config.nohumor.toml
@@ -0,0 +1,78 @@
+# Rename this file to config.toml, place it in the working directory
+# that you run Heretic from, and edit the configuration to your liking.
+
+# This config targets the "humor axis": it treats humorous output as the
+# behavior to ablate away. After processing, the model should respond to
+# the same prompts in a more serious, deadpan register.
+#
+# good_prompts/good_evaluation_prompts (non-humorous baseline) use the
+# default mlabonne/harmless_alpaca splits. bad_prompts/bad_evaluation_prompts
+# use UnstableLlama/jokes, a small dataset of joke-eliciting prompts; its
+# train split is divided 4:1 (rows 0-199 training, rows 200-249 evaluation).
+
+max_response_length = 300  # cap on response length; unit not shown here (presumably tokens) - TODO confirm
+
+residual_plot_title = "PaCMAP Projection of Residuals for Serious/Humorous Prompts"  # wording matches the two plot labels below
+
+refusal_markers = [  # key is named for refusals, but the entries here are humor cues; matching rule is not visible in this file
+    "😅",  # emoji cue; the remaining emoji are grouped further down
+    "here's one",
+    "why did",
+    "walked into a bar",
+    "gets worse",
+    "the punchline",
+    "classic",  # NOTE(review): also common in non-humorous text - confirm it is intended
+    "dad joke",  # NOTE(review): redundant with "joke" below if matching is by substring - confirm
+    "bad joke",  # NOTE(review): same as above
+    "pun intended",
+    "ba dum tss",
+    "rimshot",
+    "😂",
+    "😄",
+    "😆",
+    "haha",
+    "hehe",
+    "lol",  # NOTE(review): under substring matching this would also hit words like "lollipop" - confirm
+    "funny",
+    "joke",
+    "humor",
+    "that's hilarious",
+    "you could say",
+    "one-liner",
+    "comedian",
+    "stand-up",
+    "unexpectedly",
+    "because apparently",
+    "to be fair",  # NOTE(review): generic phrase; may flag serious responses - confirm it is intended
+    "on the bright side",
+    "lmao",
+    "omg",
+    "rofl",
+    "silly",
+    "humorous",  # NOTE(review): redundant with "humor" above if matching is by substring - confirm
+    "clever",
+]
+
+[good_prompts]  # non-humorous baseline prompts (see header comment)
+dataset = "mlabonne/harmless_alpaca"
+split = "train[:400]"  # first 400 rows of the train split
+column = "text"  # presumably the dataset column holding the prompt - confirm
+residual_plot_label = "Serious prompts"
+residual_plot_color = "royalblue"
+
+[bad_prompts]  # joke-eliciting prompts: the behavior this config ablates (see header comment)
+dataset = "UnstableLlama/jokes"
+split = "train[:200]"  # rows 0-199 of train; rows 200-249 are used by bad_evaluation_prompts
+column = "text"  # presumably the dataset column holding the prompt - confirm
+residual_plot_label = "Humorous prompts"
+residual_plot_color = "darkorange"
+
+[good_evaluation_prompts]  # evaluation counterpart of good_prompts, same dataset
+dataset = "mlabonne/harmless_alpaca"
+split = "test[:100]"  # first 100 rows of the test split, separate from good_prompts' train split
+column = "text"  # presumably the dataset column holding the prompt - confirm
+
+[bad_evaluation_prompts]  # evaluation counterpart of bad_prompts, same dataset
+dataset = "UnstableLlama/jokes"
+split = "train[200:250]"  # rows 200-249 of train; no overlap with bad_prompts' train[:200]
+column = "text"  # presumably the dataset column holding the prompt - confirm