From 3c114cb05c44e7a084db81a121c4a29985e7d716 Mon Sep 17 00:00:00 2001 From: Parker Bibus Date: Tue, 30 Jun 2026 14:30:20 -0700 Subject: [PATCH 1/3] Pre-provision ML.NET SSWE model into Helix payload to fix mlnet timeouts The mlnet performance benchmarks (StochasticDualCoordinateAscentClassifierBench.TrainSentiment) apply a pretrained SSWE word embedding that ML.NET downloads (~70 MB) from aka.ms/mlnet-resources at benchmark runtime. That download stalls on the Helix machines, hanging the entire mlnet work item until it times out and is killed, discarding all mlnet results so every mlnet benchmark appears to fail. Download the model on the build agent (reliable connectivity) into the correlation payload and point MICROSOFTML_RESOURCE_PATH at it via the Helix pre-commands, removing the runtime network dependency. Best-effort and strictly gated on run_kind == mlnet, so non-mlnet runs are unaffected and a download failure falls back to prior behavior. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/run_performance_job.py | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/scripts/run_performance_job.py b/scripts/run_performance_job.py index 47dee990e8b..8ca75feb3b6 100644 --- a/scripts/run_performance_job.py +++ b/scripts/run_performance_job.py @@ -9,6 +9,7 @@ from subprocess import CalledProcessError import sys import tempfile +import time from traceback import format_exc import urllib.request import xml.etree.ElementTree as ET @@ -120,6 +121,64 @@ class RunPerformanceJobArgs: live_libraries_build_config: Optional[str] = None cross_build: bool = False +# Subdirectory (inside the Helix correlation payload) that holds the pre-downloaded ML.NET resources. +# On the Helix machine this is referenced as <HELIX_CORRELATION_PAYLOAD>/mlnet-resources. 
+MLNET_RESOURCES_PAYLOAD_SUBDIR = "mlnet-resources" + +def try_provision_mlnet_resources(payload_dir: str) -> bool: + """ + Pre-download the ML.NET SSWE word-embedding model into the correlation payload. + + The Microsoft.ML.Benchmarks (StochasticDualCoordinateAscentClassifierBench.TrainSentiment) + apply a pretrained word embedding (SSWE 'sentiment.emd', ~70 MB). ML.NET downloads this model + from https://aka.ms/mlnet-resources at benchmark runtime if it isn't already present on disk. + On the Helix machines that download regularly stalls, which hangs the entire mlnet work item + until it hits the work item timeout and is killed, discarding ALL mlnet results (so every mlnet + benchmark appears to fail). + + To remove the runtime network dependency, we download the model here on the build agent (which + has reliable connectivity) into the correlation payload. The caller then points + MICROSOFTML_RESOURCE_PATH at it on the Helix machine so ML.NET loads the model from disk and + never makes the network call. + + ML.NET resolves the model at /Text/Sswe/sentiment.emd. + + This is best-effort: if the download fails the function returns False and the caller skips + setting the env var, leaving the previous (runtime-download) behavior unchanged. + """ + resource_root = os.path.join(payload_dir, MLNET_RESOURCES_PAYLOAD_SUBDIR) + dest = os.path.join(resource_root, "Text", "Sswe", "sentiment.emd") + os.makedirs(os.path.dirname(dest), exist_ok=True) + + # The direct blob URL is the redirect target of the aka.ms link; prefer it to avoid the redirect, + # and fall back to the aka.ms link in case the blob path ever changes. 
+ urls = [ + "https://mlpublicassets.blob.core.windows.net/assets/Text/Sswe/sentiment.emd", + "https://aka.ms/mlnet-resources/Text/Sswe/sentiment.emd", + ] + + last_error: Optional[Exception] = None + for attempt in range(1, 6): + for url in urls: + try: + getLogger().info(f"Downloading ML.NET SSWE model from {url} (attempt {attempt})") + with urllib.request.urlopen(url, timeout=300) as response, open(dest, "wb") as f: + shutil.copyfileobj(response, f) + size = os.path.getsize(dest) + if size <= 0: + raise Exception("downloaded file is empty") + getLogger().info(f"Downloaded ML.NET SSWE model ({size} bytes) to {dest}") + return True + except Exception as e: + last_error = e + getLogger().warning(f"Failed to download ML.NET SSWE model from {url}: {e}") + time.sleep(10) + + getLogger().warning( + "Could not pre-provision the ML.NET SSWE model into the payload after retries " + f"(last error: {last_error}); ML.NET will attempt to download it at benchmark runtime.") + return False + def get_pre_commands( os_group: str, os_distro: Optional[str], @@ -703,6 +762,14 @@ def run_performance_job(args: RunPerformanceJobArgs): getLogger().info("Copying performance repository to payload directory") shutil.copytree(args.performance_repo_dir, performance_payload_dir, ignore=shutil.ignore_patterns("CorrelationStaging", ".git", "artifacts", ".dotnet", ".venv", ".vs")) + # For ML.NET runs, pre-download the SSWE word-embedding model into the payload so the benchmarks + # don't have to fetch it from the network on the (flaky) Helix machines. See + # try_provision_mlnet_resources for details. The matching MICROSOFTML_RESOURCE_PATH env var is + # set in the Helix pre-commands below when this succeeds. 
+ mlnet_resources_provisioned = False + if args.run_kind == "mlnet": + mlnet_resources_provisioned = try_provision_mlnet_resources(payload_dir) + if args.internal: creator = "" scenario_arguments = ["--upload-to-perflab-container"] @@ -1025,6 +1092,15 @@ def run_performance_job(args: RunPerformanceJobArgs): helix_pre_commands = get_pre_commands(args.os_group, args.os_distro, args.internal, args.runtime_type, args.codegen_type, args.build_config, v8_version) helix_post_commands = get_post_commands(args.os_group, args.internal, args.runtime_type) + # Point ML.NET at the SSWE model that was pre-downloaded into the correlation payload above, so it + # loads the word embedding from disk instead of downloading it at benchmark runtime (which hangs + # the work item on Helix). %HELIX_CORRELATION_PAYLOAD% is expanded by the shell at run time. + if mlnet_resources_provisioned: + if args.os_group == "windows": + helix_pre_commands += [f"set \"MICROSOFTML_RESOURCE_PATH=%HELIX_CORRELATION_PAYLOAD%\\{MLNET_RESOURCES_PAYLOAD_SUBDIR}\""] + else: + helix_pre_commands += [f"export MICROSOFTML_RESOURCE_PATH=$HELIX_CORRELATION_PAYLOAD/{MLNET_RESOURCES_PAYLOAD_SUBDIR}"] + ci_setup_arguments.local_build = args.local_build if args.affinity != "0": From 824daf294d21beb4d7c92bade7cc088b97d3d52a Mon Sep 17 00:00:00 2001 From: Parker Bibus Date: Tue, 30 Jun 2026 15:09:01 -0700 Subject: [PATCH 2/3] Tighten SSWE download validation and trim docstring Download to a temp file, validate the size against Content-Length (when present) and a minimum-size floor, then atomically replace the destination so a truncated or early-closed response can't leave a corrupt sentiment.emd in the payload. Also make the function docstring more concise. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/run_performance_job.py | 46 ++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/scripts/run_performance_job.py b/scripts/run_performance_job.py index 8ca75feb3b6..86622b51bf5 100644 --- a/scripts/run_performance_job.py +++ b/scripts/run_performance_job.py @@ -129,22 +129,15 @@ def try_provision_mlnet_resources(payload_dir: str) -> bool: """ Pre-download the ML.NET SSWE word-embedding model into the correlation payload. - The Microsoft.ML.Benchmarks (StochasticDualCoordinateAscentClassifierBench.TrainSentiment) - apply a pretrained word embedding (SSWE 'sentiment.emd', ~70 MB). ML.NET downloads this model - from https://aka.ms/mlnet-resources at benchmark runtime if it isn't already present on disk. - On the Helix machines that download regularly stalls, which hangs the entire mlnet work item - until it hits the work item timeout and is killed, discarding ALL mlnet results (so every mlnet - benchmark appears to fail). - - To remove the runtime network dependency, we download the model here on the build agent (which - has reliable connectivity) into the correlation payload. The caller then points - MICROSOFTML_RESOURCE_PATH at it on the Helix machine so ML.NET loads the model from disk and - never makes the network call. - + StochasticDualCoordinateAscentClassifierBench.TrainSentiment applies a pretrained word embedding + ('sentiment.emd', ~70 MB) that ML.NET otherwise downloads from https://aka.ms/mlnet-resources at + benchmark runtime. That download stalls on the Helix machines and hangs the whole mlnet work item + until it times out. Downloading it here on the build agent (reliable connectivity) and pointing + MICROSOFTML_RESOURCE_PATH at <payload>/mlnet-resources lets ML.NET load it from disk instead. ML.NET resolves the model at /Text/Sswe/sentiment.emd. 
- This is best-effort: if the download fails the function returns False and the caller skips - setting the env var, leaving the previous (runtime-download) behavior unchanged. + Best-effort: returns False on failure so the caller skips the env var and the previous + (runtime-download) behavior is left unchanged. """ resource_root = os.path.join(payload_dir, MLNET_RESOURCES_PAYLOAD_SUBDIR) dest = os.path.join(resource_root, "Text", "Sswe", "sentiment.emd") @@ -157,21 +150,36 @@ def try_provision_mlnet_resources(payload_dir: str) -> bool: "https://aka.ms/mlnet-resources/Text/Sswe/sentiment.emd", ] + # The model is ~70 MB; require at least this much so a truncated/early-closed response (which may + # not raise) is rejected instead of leaving a corrupt file in the payload. + min_expected_size = 60 * 1024 * 1024 + last_error: Optional[Exception] = None for attempt in range(1, 6): for url in urls: + tmp_dest = dest + ".tmp" try: getLogger().info(f"Downloading ML.NET SSWE model from {url} (attempt {attempt})") - with urllib.request.urlopen(url, timeout=300) as response, open(dest, "wb") as f: - shutil.copyfileobj(response, f) - size = os.path.getsize(dest) - if size <= 0: - raise Exception("downloaded file is empty") + with urllib.request.urlopen(url, timeout=300) as response: + content_length = response.getheader("Content-Length") + expected_size = int(content_length) if content_length else None + with open(tmp_dest, "wb") as f: + shutil.copyfileobj(response, f) + + size = os.path.getsize(tmp_dest) + if expected_size is not None and size != expected_size: + raise Exception(f"size {size} does not match Content-Length {expected_size}") + if size < min_expected_size: + raise Exception(f"size {size} is smaller than the expected minimum {min_expected_size}") + + os.replace(tmp_dest, dest) getLogger().info(f"Downloaded ML.NET SSWE model ({size} bytes) to {dest}") return True except Exception as e: last_error = e getLogger().warning(f"Failed to download ML.NET SSWE model from 
{url}: {e}") + if os.path.exists(tmp_dest): + os.remove(tmp_dest) time.sleep(10) getLogger().warning( From 5c0a1c2db6ac363322102d79a460b7b01180d91c Mon Sep 17 00:00:00 2001 From: Parker Bibus Date: Wed, 1 Jul 2026 12:19:40 -0700 Subject: [PATCH 3/3] Only enforce SSWE minimum-size floor when Content-Length is absent When the server sends a Content-Length, an exact match fully validates the download, so trust it regardless of size (the asset could legitimately shrink without becoming invalid). Only fall back to the minimum-size floor when no Content-Length is available. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/run_performance_job.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/run_performance_job.py b/scripts/run_performance_job.py index 86622b51bf5..90376f796e3 100644 --- a/scripts/run_performance_job.py +++ b/scripts/run_performance_job.py @@ -150,8 +150,9 @@ def try_provision_mlnet_resources(payload_dir: str) -> bool: "https://aka.ms/mlnet-resources/Text/Sswe/sentiment.emd", ] - # The model is ~70 MB; require at least this much so a truncated/early-closed response (which may - # not raise) is rejected instead of leaving a corrupt file in the payload. + # The model is ~70 MB. When the server doesn't send a Content-Length to validate against, require + # at least this much so a truncated/early-closed response (which may not raise) is rejected + # instead of leaving a corrupt file in the payload. 
min_expected_size = 60 * 1024 * 1024 last_error: Optional[Exception] = None @@ -167,9 +168,14 @@ def try_provision_mlnet_resources(payload_dir: str) -> bool: shutil.copyfileobj(response, f) size = os.path.getsize(tmp_dest) - if expected_size is not None and size != expected_size: - raise Exception(f"size {size} does not match Content-Length {expected_size}") - if size < min_expected_size: + if expected_size is not None: + # Content-Length fully validates completeness, so trust it regardless of size + # (the asset could legitimately shrink without becoming invalid). + if size != expected_size: + raise Exception(f"size {size} does not match Content-Length {expected_size}") + elif size < min_expected_size: + # No Content-Length to validate against; fall back to a minimum-size floor to + # reject an obviously truncated/early-closed response. raise Exception(f"size {size} is smaller than the expected minimum {min_expected_size}") os.replace(tmp_dest, dest)