From 422e323fb405971a28b2761539eeea0ecdc1867c Mon Sep 17 00:00:00 2001 From: hugo Date: Tue, 26 May 2026 19:59:10 +0000 Subject: [PATCH 01/11] saturn-python-llm: declare trl, peft, datasets explicitly These are pulled in transitively by unsloth, but declaring them explicitly makes the image's training API surface stable against unsloth version bumps. The Token Factory fine-tune training script (in a separate repo) imports trl.SFTTrainer and peft directly, and needs to be able to rely on those being present and version-compatible with the rest of the image's HF stack. Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-llm/environment.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/saturn-python-llm/environment.yml b/saturn-python-llm/environment.yml index 6c1acda..92afeca 100644 --- a/saturn-python-llm/environment.yml +++ b/saturn-python-llm/environment.yml @@ -32,7 +32,14 @@ dependencies: - ipykernel - pip - pip: + # Fine-tuning stack. unsloth pulls trl/peft/datasets transitively, but we + # declare them explicitly so the image's training API surface is stable + # against unsloth version bumps. The Token Factory fine-tune training + # script (separate repo) imports trl.SFTTrainer + peft directly. - unsloth + - trl + - peft + - datasets - vllm - ray - sentence-transformers From e368c795e9501b70767455e931c30de86e53addf Mon Sep 17 00:00:00 2001 From: hugo Date: Wed, 27 May 2026 17:42:32 +0000 Subject: [PATCH 02/11] Fix recipe-template field names to match ImageSpecSchema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five recipe-template.json files were using bogus field names (recipeName/image/gpu/saturnVersion) introduced when the 12.4 images were added in Aug 2025 and propagated forward to 12.9 and the AMD image. 
The release-images builder uploads these as-is to S3 with schema_version 2022.03.01, which causes the legacy pre_load in saturn's BaseRecipeSchema to wrap them under "spec" — at which point they fail ImageSpecSchema validation (Missing required field "name"; Unknown fields recipeName/gpu/image/saturnVersion). Switch all five templates to the established shape (name/description/hardware_type/supports). The AMD template uses hardware_type=AMD; the others use gpu/cpu as appropriate, matching the existing CUDA 11.8 / 12.1 templates. --- saturn-python-312-slim-gpu-12.9/recipe-template.json | 11 +++++------ saturn-python-312-slim/recipe-template.json | 11 +++++------ saturnbase-python-amd-gpu-devel/recipe-template.json | 11 +++++------ saturnbase-python-gpu-12.9/recipe-template.json | 11 +++++------ saturnbase-python-gpu-devel-12.9/recipe-template.json | 11 +++++------ 5 files changed, 25 insertions(+), 30 deletions(-) diff --git a/saturn-python-312-slim-gpu-12.9/recipe-template.json b/saturn-python-312-slim-gpu-12.9/recipe-template.json index dacfbea..892a292 100644 --- a/saturn-python-312-slim-gpu-12.9/recipe-template.json +++ b/saturn-python-312-slim-gpu-12.9/recipe-template.json @@ -1,7 +1,6 @@ { - "recipeName": "saturn-python-312-slim-gpu-12.9", - "description": "Python 3.12 GPU slim image with CUDA 12.9 and minimal packages", - "image": "saturncloud/saturn-python-slim-gpu:2025.05.01-cuda129-python312", - "gpu": true, - "saturnVersion": "2025.05.01" -} \ No newline at end of file + "name": "saturn-python-312-slim-gpu-12.9", + "description": "Python 3.12 GPU slim image with CUDA 12.9 and minimal packages.", + "hardware_type": "gpu", + "supports": ["jupyterlab", "dask"] +} diff --git a/saturn-python-312-slim/recipe-template.json b/saturn-python-312-slim/recipe-template.json index 6c6380e..ade91e8 100644 --- a/saturn-python-312-slim/recipe-template.json +++ b/saturn-python-312-slim/recipe-template.json @@ -1,7 +1,6 @@ { - "recipeName": "saturn-python-312-slim", - 
"description": "Python 3.12 slim image with minimal packages", - "image": "saturncloud/saturn-python-slim:2025.05.01-python312", - "gpu": false, - "saturnVersion": "2025.05.01" -} \ No newline at end of file + "name": "saturn-python-312-slim", + "description": "Python 3.12 slim image with minimal packages.", + "hardware_type": "cpu", + "supports": ["jupyterlab", "dask"] +} diff --git a/saturnbase-python-amd-gpu-devel/recipe-template.json b/saturnbase-python-amd-gpu-devel/recipe-template.json index e2e852e..e8e21fb 100644 --- a/saturnbase-python-amd-gpu-devel/recipe-template.json +++ b/saturnbase-python-amd-gpu-devel/recipe-template.json @@ -1,7 +1,6 @@ { - "recipeName": "saturnbase-python-amd-gpu-devel", - "description": "Saturn base Python GPU devel image with rocm7 development tools", - "image": "saturncloud/saturnbase-python-amd-gpu-devel:2025.05.01", - "gpu": true, - "saturnVersion": "2025.05.01" -} \ No newline at end of file + "name": "saturnbase-python-amd-gpu-devel", + "description": "Saturn base Python AMD GPU devel image with ROCm 7 development tools.", + "hardware_type": "AMD", + "supports": ["jupyterlab", "dask"] +} diff --git a/saturnbase-python-gpu-12.9/recipe-template.json b/saturnbase-python-gpu-12.9/recipe-template.json index d24f9a5..d37c424 100644 --- a/saturnbase-python-gpu-12.9/recipe-template.json +++ b/saturnbase-python-gpu-12.9/recipe-template.json @@ -1,7 +1,6 @@ { - "recipeName": "saturnbase-python-gpu-12.9", - "description": "Saturn base Python GPU image with CUDA 12.9", - "image": "saturncloud/saturnbase-python-gpu-12.9:2025.05.01", - "gpu": true, - "saturnVersion": "2025.05.01" -} \ No newline at end of file + "name": "saturnbase-python-gpu-12.9", + "description": "Python-focused base image for Saturn GPU images, built on CUDA version 12.9. 
This image contains the minimal install required for the full functionality of Saturn Cloud on a GPU instance, including packages necessary to run Python, JupyterLab, and Dask.", + "hardware_type": "gpu", + "supports": ["jupyterlab", "dask"] +} diff --git a/saturnbase-python-gpu-devel-12.9/recipe-template.json b/saturnbase-python-gpu-devel-12.9/recipe-template.json index ed969d0..a6bcd66 100644 --- a/saturnbase-python-gpu-devel-12.9/recipe-template.json +++ b/saturnbase-python-gpu-devel-12.9/recipe-template.json @@ -1,7 +1,6 @@ { - "recipeName": "saturnbase-python-gpu-devel-12.9", - "description": "Saturn base Python GPU devel image with CUDA 12.9 development tools", - "image": "saturncloud/saturnbase-python-gpu-devel-12.9:2025.05.01", - "gpu": true, - "saturnVersion": "2025.05.01" -} \ No newline at end of file + "name": "saturnbase-python-gpu-devel-12.9", + "description": "Python-focused base image for Saturn GPU images, built on CUDA version 12.9 with development tools. This image contains the minimal install required for the full functionality of Saturn Cloud on a GPU instance, including packages necessary to run Python, JupyterLab, and Dask.", + "hardware_type": "gpu", + "supports": ["jupyterlab", "dask"] +} From 2421c3692246faa5373d12249021f21e5e567a4a Mon Sep 17 00:00:00 2001 From: hugo Date: Wed, 27 May 2026 22:33:55 +0000 Subject: [PATCH 03/11] saturn-python-llm: add axolotl 0.16.1 for Token Factory The Token Factory fine-tuning service wraps axolotl: an in-pod shim invokes `axolotl train ` via subprocess, so the binary needs to be on the image's Python path. Pinned exactly to 0.16.1 because Atlas's config renderer is keyed to specific axolotl YAML field names that change across versions; a loose pin or unpinned spec would silently break rendered configs on the next axolotl release. Context: saturncloud/saturn#6394. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-llm/environment.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/saturn-python-llm/environment.yml b/saturn-python-llm/environment.yml index 92afeca..2b31677 100644 --- a/saturn-python-llm/environment.yml +++ b/saturn-python-llm/environment.yml @@ -40,6 +40,9 @@ dependencies: - trl - peft - datasets + # Pinned exactly: Token Factory's Atlas renderer is keyed to specific + # axolotl YAML field names that change across versions. + - axolotl==0.16.1 - vllm - ray - sentence-transformers From 1e970bc3f7cc19ca7308412d2ee26a13361b3e02 Mon Sep 17 00:00:00 2001 From: hugo Date: Thu, 28 May 2026 13:28:05 +0000 Subject: [PATCH 04/11] Pin python=3.13 in pytorch and tensorflow envs Without an explicit python constraint, mamba now resolves python=3.14 on conda-forge. That breaks both envs on release-2026.05.01: - saturn-python-tensorflow: pip can't find tensorflow[and-cuda] for cp314 (no wheels yet) and snowflake-connector-python resolves to a cp314 wheel, so the pip step in `mamba env update` fails. - saturn-python-pytorch: the multi-channel solve (pytorch + rapidsai + nvidia + conda-forge) blows up with "queue count overflow" and SIGABRTs after ~2.5h. Pinning python=3.13 keeps us close to the latest while staying on a version everything in the env list ships wheels/builds for. We can revisit 3.14 once tensorflow et al catch up. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-pytorch/environment.yml | 2 +- saturn-python-tensorflow/environment.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/saturn-python-pytorch/environment.yml b/saturn-python-pytorch/environment.yml index b9cf4b8..5186efc 100644 --- a/saturn-python-pytorch/environment.yml +++ b/saturn-python-pytorch/environment.yml @@ -23,7 +23,7 @@ dependencies: - py-opencv - pyarrow - python-graphviz - - python + - python=3.13 - pytorch::pytorch - s3fs - setuptools diff --git a/saturn-python-tensorflow/environment.yml b/saturn-python-tensorflow/environment.yml index eecf295..612a7ac 100644 --- a/saturn-python-tensorflow/environment.yml +++ b/saturn-python-tensorflow/environment.yml @@ -17,7 +17,7 @@ dependencies: - pip - prefect - pyarrow - - python + - python=3.13 - python-graphviz - s3fs - setuptools From 551d3391fae7410c4959b3c50fcea69e53fbfc3d Mon Sep 17 00:00:00 2001 From: hugo Date: Thu, 28 May 2026 15:28:06 +0000 Subject: [PATCH 05/11] saturn-python-pytorch: move torch to PyPI cu129, drop pytorch conda channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pytorch conda channel is frozen at 2.5.1 (Oct 2024 — the team announced 2.5 as the last release on the channel) and has no cp313 builds. With the python=3.13 pin we landed in #471, the conda solve for `pytorch::pytorch` against the frozen channel has nothing to resolve. The rapidsai + pytorch + nvidia + conda-forge channel mix was also what caused the previous solve to hit `queue count overflow` and SIGABRT after 2h41m. Switch torch / torchvision / torchaudio to PyPI cu129 wheels — pip installs them after the conda env update, so we get the modern PyTorch 2.11 + CUDA 12.9 stack against the matching gpu-12.9 base image. Drop the pytorch + nvidia + rapidsai conda channels and the pytorch::* deps. Drop dask-cuda along with rapidsai (it's the only thing here that needed that channel). 
Pairs with a release-images change to point the saturn-python-pytorch build at saturnbase-python-gpu-12.9 instead of saturnbase-python-gpu-12.1. Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-pytorch/environment.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/saturn-python-pytorch/environment.yml b/saturn-python-pytorch/environment.yml index 5186efc..c469d8b 100644 --- a/saturn-python-pytorch/environment.yml +++ b/saturn-python-pytorch/environment.yml @@ -1,15 +1,11 @@ name: saturn channels: - - pytorch - - rapidsai - - nvidia - - nodefaults - conda-forge + - nodefaults dependencies: + - python=3.13 - blas=*=mkl - bokeh - - pytorch::pytorch-cuda=12.1 - - dask-cuda - dask - fastai - fsspec @@ -23,15 +19,16 @@ dependencies: - py-opencv - pyarrow - python-graphviz - - python=3.13 - - pytorch::pytorch - s3fs - setuptools - tensorboard - - pytorch::torchaudio - - pytorch::torchvision - pynvml - pip: + - --extra-index-url + - https://download.pytorch.org/whl/cu129 + - torch + - torchvision + - torchaudio - dask-saturn - saturn-client - saturnfs From 85c94ac3b09afcfef77c49c10f0671bd57ec7ac9 Mon Sep 17 00:00:00 2001 From: hugo Date: Thu, 28 May 2026 17:57:47 +0000 Subject: [PATCH 06/11] Pin python=3.13 across all py images, fix pytorch index-url syntax Mamba was resolving python=3.14 on conda-forge for any env without an explicit python pin (3.14 released 2025-10-07). #471 already pinned pytorch and tensorflow. This finishes the sweep: - saturn-python: 3.11 -> 3.13. - saturn-python-rapids: unpinned -> 3.13 (this image was on a path to hit the same 3.14 problem next build). - saturn-python-llm: 3.11 -> 3.13. Also drops pytorch/nvidia conda channels and the conda pytorch/pytorch-cuda/cuda-toolkit deps for the same reason the saturn-python-pytorch image did in #472: the pytorch conda channel is frozen at 2.5.1 and has no cp313 builds. torch/torchvision/torchaudio move to pip via the PyPI cu129 index. 
flash_attn URL swaps cp311 -> cp313 (same build family already publishes a cp313 wheel at that tag). - saturn-python-pytorch: collapses the two-line --extra-index-url pip arg into the correct single-line form. The split-line form is not the conda env yml pip-args grammar -- pip parses it as a bare --extra-index-url with no value. Leaving saturn-python-312-slim* alone: their names encode python312 and they're already pinned to 3.12. R images keep python=3.11 since python there is secondary tooling, not the image purpose. Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-llm/environment.yml | 15 +++++++-------- saturn-python-pytorch/environment.yml | 16 ++++++---------- saturn-python-rapids/environment.yml | 2 +- saturn-python/environment.yml | 2 +- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/saturn-python-llm/environment.yml b/saturn-python-llm/environment.yml index 2b31677..74d1bf0 100644 --- a/saturn-python-llm/environment.yml +++ b/saturn-python-llm/environment.yml @@ -1,14 +1,9 @@ name: saturn channels: - - pytorch - - nvidia - conda-forge - - defaults + - nodefaults dependencies: - - python=3.11 - - cuda-toolkit - - pytorch - - pytorch-cuda + - python=3.13 - transformers - tokenizers - numpy @@ -32,6 +27,10 @@ dependencies: - ipykernel - pip - pip: + - --extra-index-url https://download.pytorch.org/whl/cu129 + - torch + - torchvision + - torchaudio # Fine-tuning stack. unsloth pulls trl/peft/datasets transitively, but we # declare them explicitly so the image's training API surface is stable # against unsloth version bumps. 
The Token Factory fine-tune training @@ -50,7 +49,7 @@ dependencies: - bitsandbytes - auto-gptq - autoawq - - https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2%2Bcu12torch2.7cxx11abiTRUE-cp311-cp311-linux_x86_64.whl + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2%2Bcu12torch2.7cxx11abiTRUE-cp313-cp313-linux_x86_64.whl - xformers - gpustat - nvidia-ml-py diff --git a/saturn-python-pytorch/environment.yml b/saturn-python-pytorch/environment.yml index 5186efc..f5fb7b9 100644 --- a/saturn-python-pytorch/environment.yml +++ b/saturn-python-pytorch/environment.yml @@ -1,15 +1,11 @@ name: saturn channels: - - pytorch - - rapidsai - - nvidia - - nodefaults - conda-forge + - nodefaults dependencies: + - python=3.13 - blas=*=mkl - bokeh - - pytorch::pytorch-cuda=12.1 - - dask-cuda - dask - fastai - fsspec @@ -23,15 +19,15 @@ dependencies: - py-opencv - pyarrow - python-graphviz - - python=3.13 - - pytorch::pytorch - s3fs - setuptools - tensorboard - - pytorch::torchaudio - - pytorch::torchvision - pynvml - pip: + - --extra-index-url https://download.pytorch.org/whl/cu129 + - torch + - torchvision + - torchaudio - dask-saturn - saturn-client - saturnfs diff --git a/saturn-python-rapids/environment.yml b/saturn-python-rapids/environment.yml index 9a48ff7..3e1b995 100644 --- a/saturn-python-rapids/environment.yml +++ b/saturn-python-rapids/environment.yml @@ -21,7 +21,7 @@ dependencies: - prefect - pyarrow - python-graphviz - - python + - python=3.13 - rapids - s3fs - scikit-learn diff --git a/saturn-python/environment.yml b/saturn-python/environment.yml index b7513db..3c9e968 100644 --- a/saturn-python/environment.yml +++ b/saturn-python/environment.yml @@ -14,7 +14,7 @@ dependencies: - pandas - pip - pyarrow - - python=3.11 + - python=3.13 - python-graphviz - s3fs - scikit-learn From a739062571b48e407a705a884877164258dc0092 Mon Sep 17 00:00:00 2001 From: hugo Date: Thu, 28 
May 2026 19:13:30 +0000 Subject: [PATCH 07/11] saturn-python-llm: pin python=3.12, drop auto-gptq/autoawq, bump flash_attn The python=3.13 path is blocked by axolotl 0.16.1's unconditional zstandard==0.22.0 transitive pin -- that wheel only ships cp310-cp312, no cp313. Stepping down to python=3.12 keeps the rest of the axolotl 0.16.1 pin chain (torch==2.8.0, transformers==5.5.0, accelerate==1.13.0, bitsandbytes==0.49.1, datasets==4.5.0, trl==0.29.0) resolvable from pre-built wheels. Also: - Drop auto-gptq and autoawq. auto-gptq's sdist runs `import torch` at build-deps phase before torch is installed, breaking the env. autoawq has the same kind of issue. Neither is referenced by any saturncloud code; vllm handles GPTQ/AWQ checkpoint loading via compressed-tensors without these libs. - Bump flash_attn to v2.8.3 with the cu12torch2.8 cp312 wheel. The cu12torch2.7 wheel was ABI-incompatible with torch 2.8.0 (undefined symbol _ZN3c104cuda9SetDeviceEa). v2.8.3 is the version axolotl 0.16.1 itself wants under its flash-attn extra. Verified locally: env builds cleanly, all heavy imports (torch, transformers, axolotl, flash_attn, vllm, peft, trl, datasets, accelerate, bitsandbytes) succeed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-llm/environment.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/saturn-python-llm/environment.yml b/saturn-python-llm/environment.yml index 74d1bf0..c64f0b5 100644 --- a/saturn-python-llm/environment.yml +++ b/saturn-python-llm/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge - nodefaults dependencies: - - python=3.13 + - python=3.12 - transformers - tokenizers - numpy @@ -47,9 +47,7 @@ dependencies: - sentence-transformers - accelerate - bitsandbytes - - auto-gptq - - autoawq - - https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2%2Bcu12torch2.7cxx11abiTRUE-cp313-cp313-linux_x86_64.whl + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl - xformers - gpustat - nvidia-ml-py From faf7dabb50faf97dc3ab53d57fb16b177d40b216 Mon Sep 17 00:00:00 2001 From: hugo Date: Thu, 28 May 2026 19:32:56 +0000 Subject: [PATCH 08/11] saturn-python-rapids: bump cuda to 12.9, drop dask-sql, pin rapids>=26.02 Three independent failures in the existing env on python=3.13: - cuda-version=12.0 was no longer in conda-forge (only 12.4+ ships now), so the previous CI build couldn't even resolve a CUDA package set. Bump to 12.9 to match the gpu-12.9 base we already have for the pytorch image; rapids' bundled CUDA libs don't actually need to match the runtime base. - dask-sql is abandoned upstream and tops out at python 3.12 on conda-forge. With our python=3.13 sweep it pulled the solver into 2021-era versions. No saturncloud code references dask-sql; users who want SQL-on-dask can pip-install it on demand. - Leaving rapids unbounded resolved to 25.08, which embeds an older cuml that broke at import time against the newer scikit-learn 1.8 conda-forge ships (BaseEstimator._get_default_requests was renamed to _get_metadata_request). 
rapids>=26.02 resolves to 26.04 with a cuml that matches. Verified locally: env builds clean, all heavy imports (cudf, cuml, cupy, dask, dask_ml, sklearn, pyarrow, cvxpy, prefect, numba) succeed. Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-rapids/environment.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/saturn-python-rapids/environment.yml b/saturn-python-rapids/environment.yml index 3e1b995..31c9416 100644 --- a/saturn-python-rapids/environment.yml +++ b/saturn-python-rapids/environment.yml @@ -6,10 +6,9 @@ channels: - conda-forge dependencies: - bokeh - - cuda-version=12.0 + - cuda-version=12.9 - cvxpy - dask-ml - - dask-sql - dask - ipykernel - ipywidgets @@ -22,7 +21,7 @@ dependencies: - pyarrow - python-graphviz - python=3.13 - - rapids + - rapids>=26.02 - s3fs - scikit-learn - scipy From 621e308cd3f39f462b90d163bf10591bfbd1b219 Mon Sep 17 00:00:00 2001 From: hugo Date: Thu, 28 May 2026 20:08:20 +0000 Subject: [PATCH 09/11] saturn-python-llm: pin vllm==0.11.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI's pip resolver walks vllm back through versions looking for one that satisfies the full constraint set (torch==2.8.0 from axolotl 0.16.1 narrows the window significantly). Locally it lands at 0.11.0; in the build container it kept walking past 0.11.0's manylinux1 wheel and eventually fell into vllm 0.5.x sdists, which try to call /usr/local/cuda/bin/nvcc at metadata-extraction time — but the runtime base image doesn't ship nvcc. Pinning to 0.11.0 (the version the local solve already lands on) short-circuits the backtracking and keeps the resolution wheel-only. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- saturn-python-llm/environment.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/saturn-python-llm/environment.yml b/saturn-python-llm/environment.yml index c64f0b5..b121af0 100644 --- a/saturn-python-llm/environment.yml +++ b/saturn-python-llm/environment.yml @@ -42,7 +42,10 @@ dependencies: # Pinned exactly: Token Factory's Atlas renderer is keyed to specific # axolotl YAML field names that change across versions. - axolotl==0.16.1 - - vllm + # Pinned: vllm walks back through versions otherwise — axolotl 0.16.1 forces + # torch==2.8.0, and only 0.10–0.11 satisfy that. Pin to keep CI's pip from + # backtracking past wheel-only releases into 0.5.x sdists (which need nvcc). + - vllm==0.11.0 - ray - sentence-transformers - accelerate From f8031f864ad52bb067b3cb4c646617b35a3e3f32 Mon Sep 17 00:00:00 2001 From: hugo Date: Sat, 30 May 2026 17:51:09 +0000 Subject: [PATCH 10/11] Pin transformers <5 in saturn-python-llm so vLLM 0.11 boots vLLM 0.11.0 reads tokenizer.all_special_tokens_extended at startup, which transformers 5.x removed -> AttributeError -> CrashLoopBackOff. The env was leaving transformers unpinned and resolving to 5.5.0, breaking every vLLM serve pod built on this image. Pin transformers>=4.55,<5 (conda + repeated in the pip: block). 4.57.6 was empirically verified to boot vLLM 0.11.0, load Qwen2.5-7B + a LoRA adapter, and serve /v1/chat/completions. axolotl 0.16.1 (kept exactly pinned: Token Factory's Atlas YAML renderer is keyed to its field names) carries an over-strict transformers==5.5.0 metadata pin that would drag 5.x back in. It runs fine on transformers 4.57.x, so it is now installed --no-deps in a separate Dockerfile step, with its real transitive deps declared explicitly in environment.yml. The cross-constraint still holds: axolotl 0.16.1 forces torch==2.8.0 and only vLLM 0.10-0.11 satisfy that, so vLLM stays pinned at 0.11.0. 
Co-Authored-By: Claude Opus 4.8 --- saturn-python-llm/Dockerfile | 6 ++++++ saturn-python-llm/environment.yml | 35 +++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/saturn-python-llm/Dockerfile b/saturn-python-llm/Dockerfile index 6fb0d73..335b23b 100644 --- a/saturn-python-llm/Dockerfile +++ b/saturn-python-llm/Dockerfile @@ -6,7 +6,13 @@ RUN sudo apt-get -qq --allow-releaseinfo-change update && \ libgl1 COPY environment.yml /tmp/environment.yml +# axolotl 0.16.1 is installed --no-deps so its over-strict transformers==5.5.0 +# metadata pin cannot drag transformers 5.x in and break vLLM 0.11 at boot. +# Its transitive deps are declared explicitly in environment.yml. This is a +# separate step because a --no-deps line inside the env.yml pip: block would +# apply to the whole block, suppressing deps for every pip entry. RUN mamba env update -n saturn --file /tmp/environment.yml && \ + ${CONDA_DIR}/envs/saturn/bin/python -m pip install --no-deps axolotl==0.16.1 && \ ${CONDA_DIR}/envs/saturn/bin/python -m ipykernel install \ --name python3 \ --display-name 'saturn (Python 3)' \ diff --git a/saturn-python-llm/environment.yml b/saturn-python-llm/environment.yml index b121af0..bc0455f 100644 --- a/saturn-python-llm/environment.yml +++ b/saturn-python-llm/environment.yml @@ -4,7 +4,12 @@ channels: - nodefaults dependencies: - python=3.12 - - transformers + # vLLM 0.11 boots only on transformers 4.x: transformers 5.x removed + # tokenizer.all_special_tokens_extended, which vLLM 0.11 reads at startup, + # so 5.x triggers AttributeError -> CrashLoopBackOff. axolotl 0.16.1 runs + # fine on transformers 4.57.x at runtime (its metadata pins 5.5.0, but that + # is over-strict; see the pip: block -- axolotl is installed --no-deps in the Dockerfile). 
+ - transformers>=4.55,<5 - tokenizers - numpy - psutil @@ -39,9 +44,31 @@ dependencies: - trl - peft - datasets - # Pinned exactly: Token Factory's Atlas renderer is keyed to specific - # axolotl YAML field names that change across versions. - - axolotl==0.16.1 + # Held below transformers 5 to match the conda transformers pin above (the + # pip: block is run through pip by `mamba env update`, so repeat the bound + # here to stop pip backtracking to 5.x). + - transformers>=4.55,<5 + # axolotl 0.16.1 transitive deps. axolotl itself is installed separately in + # the Dockerfile with --no-deps, because its metadata carries an over-strict + # `transformers==5.5.0` pin that would otherwise drag transformers 5.x back + # in and break vLLM 0.11 at boot. axolotl runs fine on transformers 4.57.x; + # its real transitive deps are declared here (and via unsloth/vllm/trl/peft). + # NOTE: --no-deps cannot be scoped to a single entry inside this block (pip + # applies it to the whole `pip install` invocation), so axolotl is pulled + # out into its own `pip install --no-deps axolotl==0.16.1` Dockerfile step. + - liger-kernel==0.7.0 + - lm_eval==0.4.11 + - fla-core==0.4.1 + - flash-linear-attention==0.4.1 + - torchao==0.17.0 + - optimum==1.16.2 + - trackio>=0.16.1 + - schedulefree==1.4.1 + - axolotl-contribs-lgpl==0.0.7 + - axolotl-contribs-mit==0.0.6 + - openenv-core==0.1.0 + - mistral-common==1.11.0 + - modal==1.3.0.post1 # Pinned: vllm walks back through versions otherwise — axolotl 0.16.1 forces # torch==2.8.0, and only 0.10–0.11 satisfy that. Pin to keep CI's pip from # backtracking past wheel-only releases into 0.5.x sdists (which need nvcc). 
From b8928894c7fee1e911e8bde372ccd076f2343cad Mon Sep 17 00:00:00 2001 From: hugo Date: Thu, 4 Jun 2026 22:04:41 +0000 Subject: [PATCH 11/11] Add saturn-python-vllm + saturn-python-axolotl (split from saturn-python-llm) saturn-python-llm tried to be one image for both vLLM serving and axolotl fine-tuning, but the dep stacks are incompatible: vLLM 0.11 needs transformers<5 (5.x removed tokenizer.all_special_tokens_extended -> CrashLoopBackOff), while axolotl 0.16.1 needs the transformers 5.x API (Trainer.create_optimizer(model=) is 5.x-only; on 4.57 training dies inside the loop). Split by engine, extensible to future inference engines / fine-tuning frameworks: - saturn-python-vllm: inference (vLLM, transformers<5; axolotl deps removed, no --no-deps hack) - saturn-python-axolotl: fine-tuning (axolotl installed WITH deps, so it pulls the correct transformers 5.5 / datasets 4.5 / trl 0.29 / hf-hub>=1 stack; flash-attn + deepspeed + mlflow extras) Both build on the cu129 GPU base. Registered for building in saturncloud/release-images (PR adds them to data_science.py / main_release.py / the build matrix). 
Co-Authored-By: Claude Opus 4.8 --- saturn-python-axolotl/Dockerfile | 25 +++++++++ saturn-python-axolotl/Makefile | 9 ++++ saturn-python-axolotl/environment.yml | 53 ++++++++++++++++++ saturn-python-axolotl/recipe-template.json | 6 +++ saturn-python-vllm/Dockerfile | 20 +++++++ saturn-python-vllm/Makefile | 9 ++++ saturn-python-vllm/environment.yml | 62 ++++++++++++++++++++++ saturn-python-vllm/recipe-template.json | 6 +++ 8 files changed, 190 insertions(+) create mode 100644 saturn-python-axolotl/Dockerfile create mode 100644 saturn-python-axolotl/Makefile create mode 100644 saturn-python-axolotl/environment.yml create mode 100644 saturn-python-axolotl/recipe-template.json create mode 100644 saturn-python-vllm/Dockerfile create mode 100644 saturn-python-vllm/Makefile create mode 100644 saturn-python-vllm/environment.yml create mode 100644 saturn-python-vllm/recipe-template.json diff --git a/saturn-python-axolotl/Dockerfile b/saturn-python-axolotl/Dockerfile new file mode 100644 index 0000000..f4f04e1 --- /dev/null +++ b/saturn-python-axolotl/Dockerfile @@ -0,0 +1,25 @@ +ARG SATURNBASE_GPU_IMAGE +FROM ${SATURNBASE_GPU_IMAGE} + +RUN sudo apt-get -qq --allow-releaseinfo-change update && \ + sudo apt-get -qq install --yes --no-install-recommends \ + libgl1 + +COPY environment.yml /tmp/environment.yml +# Unlike the serving image (saturn-python-llm), axolotl is installed WITH its +# deps here: this image has no vLLM, so there is no transformers<5 constraint to +# protect, and axolotl 0.16.1 needs the transformers 5.x API at runtime. Letting +# it resolve its own tree (transformers 5.5, datasets 4.5, trl 0.29, hf-hub>=1, +# accelerate 1.13, ...) is what makes training actually work. axolotl is declared +# in environment.yml's pip: block with its extras, so a normal env update pulls +# everything; no separate --no-deps step. 
+RUN mamba env update -n saturn --file /tmp/environment.yml && \ + ${CONDA_DIR}/envs/saturn/bin/python -m ipykernel install \ + --name python3 \ + --display-name 'saturn (Python 3)' \ + --prefix=${CONDA_DIR} && \ + ${CONDA_DIR}/bin/conda clean -afy && \ + find ${CONDA_DIR} -type f,l -name '*.pyc' -delete && \ + find ${CONDA_DIR} -type f,l -name '*.a' -delete && \ + find ${CONDA_DIR} -type f,l -name '*.js.map' -delete +RUN echo '' > ${CONDA_DIR}/envs/saturn/conda-meta/history diff --git a/saturn-python-axolotl/Makefile b/saturn-python-axolotl/Makefile new file mode 100644 index 0000000..aba3758 --- /dev/null +++ b/saturn-python-axolotl/Makefile @@ -0,0 +1,9 @@ +include .env_deps +export + +build_image: + docker build \ + --no-cache \ + --build-arg SATURNBASE_GPU_IMAGE=${SATURNBASE_GPU_IMAGE} \ + -t ${IMAGE} \ + . diff --git a/saturn-python-axolotl/environment.yml b/saturn-python-axolotl/environment.yml new file mode 100644 index 0000000..54f933c --- /dev/null +++ b/saturn-python-axolotl/environment.yml @@ -0,0 +1,53 @@ +name: saturn +channels: + - conda-forge + - nodefaults +dependencies: + - python=3.12 + # TRAINING-ONLY image (axolotl). Split out from saturn-python-llm because that + # image must pin transformers<5 for vLLM 0.11 serving, but axolotl 0.16.1 is + # hard-coupled to the transformers 5.x API (e.g. Trainer.create_optimizer(model=) + # — a 5.x-only signature; on 4.57 training dies inside the loop). The two dep + # stacks are incompatible in one env, so serving lives in saturn-python-vllm and + # training lives here. Because there is NO vLLM here, axolotl is installed WITH + # its deps (see Dockerfile) — no transformers pin, no --no-deps hack — and it + # pulls the correct transformers 5.5 / datasets 4.5 / trl 0.29 / hf-hub>=1 set. 
+ - numpy + - psutil + - pandas + - tqdm + - click + - rich + - tensorboard + - wandb + - ipykernel + - pip + - pip: + - --extra-index-url https://download.pytorch.org/whl/cu129 + # axolotl 0.16.1 pins torch==2.8.0; install it from the cu129 index so the + # GPU build is used (and so the flash-attn wheel below matches torch 2.8). + - torch==2.8.0 + - torchvision + - torchaudio + # The whole fine-tuning stack. Unlike the serving image, we let axolotl + # resolve its own dependency tree (transformers 5.5.0, datasets 4.5.0, + # trl 0.29.0, accelerate 1.13.0, peft, hf-hub>=1, etc.) — installed WITH + # deps in the Dockerfile. [flash-attn] + [deepspeed] extras for real + # multi-GPU LoRA/full fine-tunes; [mlflow] for experiment tracking. + - axolotl[flash-attn,deepspeed,mlflow]==0.16.1 + # flash-attn's build wants torch present at install time; ship the prebuilt + # cu12/torch2.8 wheel so it doesn't compile from source (slow, needs nvcc). + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl + # Saturn workspace + ops basics (mirror the serving image's tail). + - gpustat + - black + - isort + - mypy + - pytest + - saturn-client + # The Token Factory inline training script (pdc/scripts/tf/finetune.py) only + # needs requests + PyYAML at runtime; both come in transitively (requests via + # axolotl/saturn-client, PyYAML via axolotl). Listed here for clarity / in + # case axolotl ever drops them. 
+ - requests + - pyyaml diff --git a/saturn-python-axolotl/recipe-template.json b/saturn-python-axolotl/recipe-template.json new file mode 100644 index 0000000..554d645 --- /dev/null +++ b/saturn-python-axolotl/recipe-template.json @@ -0,0 +1,6 @@ +{ + "name": "saturn-python-axolotl", + "description": "Fine-tuning LLMs with axolotl (transformers 5 training stack)", + "hardware_type": "gpu", + "supports": ["jupyterlab", "dask"] +} diff --git a/saturn-python-vllm/Dockerfile b/saturn-python-vllm/Dockerfile new file mode 100644 index 0000000..fae3dd9 --- /dev/null +++ b/saturn-python-vllm/Dockerfile @@ -0,0 +1,20 @@ +ARG SATURNBASE_GPU_IMAGE +FROM ${SATURNBASE_GPU_IMAGE} + +RUN sudo apt-get -qq --allow-releaseinfo-change update && \ + sudo apt-get -qq install --yes --no-install-recommends \ + libgl1 + +COPY environment.yml /tmp/environment.yml +# vLLM serving image. No axolotl here (it lives in saturn-python-axolotl), so the +# former --no-deps axolotl install step is gone and a plain env update suffices. +RUN mamba env update -n saturn --file /tmp/environment.yml && \ + ${CONDA_DIR}/envs/saturn/bin/python -m ipykernel install \ + --name python3 \ + --display-name 'saturn (Python 3)' \ + --prefix=${CONDA_DIR} && \ + ${CONDA_DIR}/bin/conda clean -afy && \ + find ${CONDA_DIR} -type f,l -name '*.pyc' -delete && \ + find ${CONDA_DIR} -type f,l -name '*.a' -delete && \ + find ${CONDA_DIR} -type f,l -name '*.js.map' -delete +RUN echo '' > ${CONDA_DIR}/envs/saturn/conda-meta/history diff --git a/saturn-python-vllm/Makefile b/saturn-python-vllm/Makefile new file mode 100644 index 0000000..aba3758 --- /dev/null +++ b/saturn-python-vllm/Makefile @@ -0,0 +1,9 @@ +include .env_deps +export + +build_image: + docker build \ + --no-cache \ + --build-arg SATURNBASE_GPU_IMAGE=${SATURNBASE_GPU_IMAGE} \ + -t ${IMAGE} \ + . 
diff --git a/saturn-python-vllm/environment.yml b/saturn-python-vllm/environment.yml
new file mode 100644
index 0000000..e1379b8
--- /dev/null
+++ b/saturn-python-vllm/environment.yml
@@ -0,0 +1,62 @@
+name: saturn
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - python=3.12
+  # INFERENCE/SERVING image (vLLM). Split out from the former saturn-python-llm
+  # (which tried to be one image for both training and serving). vLLM 0.11 boots
+  # only on transformers 4.x: transformers 5.x removed
+  # tokenizer.all_special_tokens_extended, which vLLM 0.11 reads at startup, so
+  # 5.x -> AttributeError -> CrashLoopBackOff. Fine-tuning (axolotl, which needs
+  # the transformers 5.x API) now lives in saturn-python-axolotl, so this image
+  # is free to pin transformers<5 without breaking training.
+  - transformers>=4.55,<5
+  - tokenizers
+  - numpy
+  - psutil
+  - pydantic
+  - fastapi
+  - uvicorn
+  - aiohttp
+  - requests
+  - typing-extensions
+  - packaging
+  - filelock
+  - matplotlib
+  - pandas
+  - seaborn
+  - tqdm
+  - click
+  - rich
+  - tensorboard
+  - wandb
+  - ipykernel
+  - pip
+  - pip:
+    - --extra-index-url https://download.pytorch.org/whl/cu129
+    - torch  # NOTE(review): unpinned, but the flash-attn wheel below targets torch 2.8 — confirm the resolved version
+    - torchvision
+    - torchaudio
+    # Held below transformers 5 to match the conda pin above (the pip: block is
+    # run through pip by `mamba env update`, so repeat the bound here to stop
+    # pip from resolving transformers to 5.x).
+    - transformers>=4.55,<5
+    # vLLM serving stack.
+    - vllm==0.11.0
+    - ray
+    - sentence-transformers
+    # peft so vLLM can load the LoRA adapters Token Factory fine-tunes produce.
+    - peft
+    - accelerate
+    - bitsandbytes
+    - https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
+    - xformers
+    - gpustat
+    - nvidia-ml-py
+    - huggingface-hub
+    - black
+    - isort
+    - mypy
+    - pytest
+    - saturn-client
diff --git a/saturn-python-vllm/recipe-template.json b/saturn-python-vllm/recipe-template.json
new file mode 100644
index 0000000..9409275
--- /dev/null
+++ b/saturn-python-vllm/recipe-template.json
@@ -0,0 +1,6 @@
+{
+  "name": "saturn-python-vllm",
+  "description": "Serving LLMs with vLLM (inference; transformers 4 stack)",
+  "hardware_type": "gpu",
+  "supports": ["jupyterlab", "dask"]
+}