From 8fbb67cf5a524b12eff4fda0cbcf0149c3ba67b1 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 11 Jun 2026 14:37:30 +0800 Subject: [PATCH 01/11] update --- autotest/config.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index ba15121ed..6bfd70eae 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -352,6 +352,7 @@ case: - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl - XTUNER_DETERMINISTIC=true + - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-sft-vl-dense/tracker.jsonl check_metrics: @@ -769,6 +770,7 @@ case: - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k/test.jsonl - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k - XTUNER_DETERMINISTIC=true + - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-rl-vl-lmdeploy/tracker.jsonl check_metrics: @@ -813,6 +815,7 @@ case: - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true + - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-grpo/tracker.jsonl check_metrics: @@ -857,6 +860,7 @@ case: - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true + - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-dapo/tracker.jsonl check_metrics: @@ -902,6 +906,7 @@ case: - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true - XTUNER_GC_ENABLE=1 + - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker.jsonl check_metrics: @@ -948,6 +953,7 @@ case: - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true - XTUNER_GC_ENABLE=1 + - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker-resume.jsonl check_metrics: @@ -993,6 +999,7 @@ case: - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/test.jsonl - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true + - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-lmdeploy-dapo/tracker.jsonl check_metrics: @@ -1041,7 +1048,6 @@ case: - XTUNER_USE_LMDEPLOY=0 - XTUNER_USE_VLLM=0 - XTUNER_USE_SGLANG=1 - - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 - XTUNER_USE_FA3=0 assert_info: base_metric: qwen3-rl-sglang/tracker.jsonl From 40046311bd60e914af73f2ebc40274bda775f15a Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 11 Jun 2026 14:42:51 +0800 Subject: [PATCH 02/11] update --- .github/workflows/ete_test_gpu.yaml | 4 ++-- .github/workflows/ete_test_npu.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ete_test_gpu.yaml b/.github/workflows/ete_test_gpu.yaml index 009b22e64..d230206eb 100644 --- a/.github/workflows/ete_test_gpu.yaml +++ b/.github/workflows/ete_test_gpu.yaml @@ -65,9 +65,9 @@ jobs: unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; CASE_NAME="${{ github.event.inputs.run_case || 'all' }}" if [ "$CASE_NAME" = "all" ]; then - pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }} + pytest autotest/test_all.py -m all -n 1 -s -vv --run_id ${{ github.run_id }} else - pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -vv --run_id ${{ github.run_id }} + pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -s -vv --run_id ${{ github.run_id }} fi - name: Check report files diff --git a/.github/workflows/ete_test_npu.yaml b/.github/workflows/ete_test_npu.yaml index 2ca5aaff2..47c2afb6e 100644 --- a/.github/workflows/ete_test_npu.yaml +++ b/.github/workflows/ete_test_npu.yaml @@ -61,9 +61,9 @@ jobs: unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; CASE_NAME="${{ github.event.inputs.run_case || 'all' }}" if [ "$CASE_NAME" = "all" ]; then - export DEVICE=npu && pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }} + export DEVICE=npu && pytest autotest/test_all.py -m all -n 1 -s -vv --run_id ${{ github.run_id }} else - export DEVICE=npu && pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -vv --run_id ${{ github.run_id }} + export DEVICE=npu && pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -s -vv --run_id ${{ github.run_id }} fi - name: Check report files From 812be4718dd102eaa4b5c9189720790bd2efc6ee Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 11 Jun 2026 14:45:42 +0800 Subject: [PATCH 03/11] update --- autotest/config.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 6bfd70eae..1db4e2296 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -770,7 +770,6 @@ case: - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k/test.jsonl - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k - XTUNER_DETERMINISTIC=true - - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-rl-vl-lmdeploy/tracker.jsonl check_metrics: @@ -815,7 +814,6 @@ case: - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true - - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-grpo/tracker.jsonl check_metrics: @@ -860,7 +858,6 @@ case: - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true - - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-dapo/tracker.jsonl check_metrics: @@ -906,7 +903,6 @@ case: - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true - XTUNER_GC_ENABLE=1 - - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker.jsonl check_metrics: @@ -953,7 +949,6 @@ case: - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true - XTUNER_GC_ENABLE=1 - - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker-resume.jsonl check_metrics: @@ -999,7 +994,6 @@ case: - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/test.jsonl - XTUNER_USE_LMDEPLOY=1 - XTUNER_DETERMINISTIC=true - - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0 assert_info: base_metric: qwen3-5-rl-lmdeploy-dapo/tracker.jsonl check_metrics: From 9f4d171a6e782d04b7eedb8f8ed56a7c85af2d2e Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 11 Jun 2026 16:41:14 +0800 Subject: [PATCH 04/11] add new matric and fix repeated png --- autotest/config-npu.yaml | 2 + autotest/config.yaml | 8 +++ autotest/module/train.py | 124 +++++++++++++++++++++++++++----- autotest/utils/check_metric.py | 104 ++++++++++++++++++++++++--- autotest/utils/metric_report.py | 28 +++++--- 5 files changed, 233 insertions(+), 33 deletions(-) diff --git a/autotest/config-npu.yaml b/autotest/config-npu.yaml index 5661e6638..8a86d3720 100644 --- a/autotest/config-npu.yaml +++ b/autotest/config-npu.yaml @@ -56,6 +56,7 @@ case: npu-qwen3-sft-ep8: - type: sft + phase: first parameters: config: autotest/config/npu_qwen3_moe_30BA3_ep8.py output_path: /mnt/hwfile/llmrazor/qa-llm-cicd/test_output @@ -80,6 +81,7 @@ case: timeout: 10800 - type: sft + phase: resume pre_action: command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output npu-qwen3-sft-ep8 sft' parameters: diff --git a/autotest/config.yaml b/autotest/config.yaml index 1db4e2296..0e31b3bbd 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -54,6 +54,7 @@ case: qwen3-sft-ep8: - type: sft + phase: first parameters: config: autotest/config/qwen3_moe_30BA3_ep8.py output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output @@ -79,6 +80,7 @@ case: timeout: 1500 - type: sft + phase: resume pre_action: command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-sft-ep8 sft' parameters: @@ -475,6 +477,7 @@ case: qwen3-5-sft-sp4-resume: - type: sft + phase: first parameters: config: autotest/config/qwen3_5_moe_30BA3_sp4.py output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output @@ -499,6 +502,7 @@ case: - type: sft + phase: resume pre_action: command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume sft' parameters: @@ -608,6 +612,7 @@ case: qwen3-5-sft-sp4-resume-vl: - type: sft + phase: first parameters: config: autotest/config/qwen3_5_moe_30BA3_sp4_vl.py output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output @@ -634,6 +639,7 @@ case: - type: sft + phase: resume pre_action: command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume-vl sft' parameters: @@ -891,6 +897,7 @@ case: qwen3-5-rl-vl-lmdeploy-resume: - type: rl + phase: first parameters: config: autotest/config/rl_qwen3p5_vl_35B_dapo_ep2_resume.py infer_backend: lmdeploy @@ -935,6 +942,7 @@ case: - type: rl + phase: resume pre_action: command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-rl-vl-lmdeploy-resume rl' parameters: diff --git a/autotest/module/train.py b/autotest/module/train.py index 861221513..ff2668e8e 100644 --- a/autotest/module/train.py +++ b/autotest/module/train.py @@ -1,16 +1,22 @@ +import json import os +import shutil +from typing import Any from utils.check_metric import check_result, check_rl_result from utils.run_cmd import run_cmd +FIRST_RUN_TRACKER_SNAPSHOT = "_first_run_tracker.jsonl" + + class Train: def get_cmd(config): print(config) config_path = config.get("parameters").get("config") train_type = config.get("type") nproc_per_node = config.get("resource", {}).get("gpus_per_task", 8) - pip_package = config.get("resource", {}).get("pip_package", 'ls') + pip_package = config.get("resource", {}).get("pip_package", "ls") if train_type in ["sft", "rl"]: model_config = config.get("parameters", {}).get("model", None) config_path = config.get("parameters", {}).get("config", None) @@ -70,22 +76,28 @@ def get_cmd(config): def validate(config): work_dir = config.get("work_dir", None) - base_path = os.path.join( - config.get("base_path").get("base_baseline_path"), config.get("assert_info", {}).get("base_metric", None) - ) + base_metric = config.get("assert_info", {}).get("base_metric", None) + base_path = os.path.join(config.get("base_path").get("base_baseline_path"), base_metric) train_type = config.get("type") + case_name = config["case_name"] + phase = config.get("phase") + context = config.get("context", {}) + + cur_path = resolve_tracker_path(work_dir, train_type, phase, context=context) + if train_type == "sft": - cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/rank0/tracker.jsonl") check_metrics = config.get("assert_info", {}).get("check_metrics", {}) - return check_result(config["case_name"], base_path, cur_path, check_metrics) + result = check_result(case_name, base_path, cur_path, check_metrics, phase=phase) elif train_type == "rl": - cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/tracker.jsonl") check_metrics = config.get("assert_info", {}) - return check_rl_result(config["case_name"], base_path, cur_path, check_metrics) + result = check_rl_result(case_name, base_path, cur_path, check_metrics, phase=phase) else: print("Unknown type: {train_type}") return False + snapshot_first_run_tracker(work_dir, phase, cur_path, context=context) + return result + def pre_action(config=None): action_info = config.get("pre_action", None) if action_info: @@ -101,12 +113,92 @@ def post_action(config=None): run_cmd(action_cmd) -def get_latest_subdir(work_dir): - dirs = [ - d for d in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, d)) and len(d) == 14 and d.isdigit() - ] +def list_timestamp_subdirs(work_dir: str) -> list[str]: + return sorted( + name + for name in os.listdir(work_dir) + if os.path.isdir(os.path.join(work_dir, name)) and len(name) == 14 and name.isdigit() + ) + + +def _tracker_relpath(train_type: str) -> str: + if train_type == "sft": + return "logs/exp_tracking/rank0/tracker.jsonl" + return "logs/exp_tracking/tracker.jsonl" + + +def _tracker_path(exp_dir: str | None, train_type: str) -> str: + return os.path.join(exp_dir, _tracker_relpath(train_type)) + + +def _snapshot_path(work_dir: str) -> str: + return os.path.join(work_dir, FIRST_RUN_TRACKER_SNAPSHOT) + + +def _write_first_run_segment(src: str, dst: str) -> None: + os.makedirs(os.path.dirname(dst), exist_ok=True) + seen_steps: set[Any] = set() + with open(src, encoding="utf-8") as fin, open(dst, "w", encoding="utf-8") as fout: + for line in fin: + if not line.strip(): + continue + step = json.loads(line).get("step") + if step in seen_steps: + break + seen_steps.add(step) + fout.write(line if line.endswith("\n") else f"{line}\n") + + +def _has_duplicate_steps(tracker_path: str) -> bool: + steps: list[Any] = [] + with open(tracker_path, encoding="utf-8") as f: + for line in f: + if line.strip(): + steps.append(json.loads(line).get("step")) + return len(steps) != len(set(steps)) + + +def resolve_tracker_path( + work_dir: str, + train_type: str, + phase: str | None, + context: dict[str, Any] | None = None, +) -> str: + context = context or {} + snapshot = context.get("first_run_tracker") or _snapshot_path(work_dir) + + if phase == "first": + if os.path.isfile(snapshot): + return snapshot + + subdirs = list_timestamp_subdirs(work_dir) + if len(subdirs) > 1: + exp_dir = os.path.join(work_dir, subdirs[0]) + else: + exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None + live_tracker = _tracker_path(exp_dir, train_type) + + if os.path.isfile(live_tracker) and _has_duplicate_steps(live_tracker): + _write_first_run_segment(live_tracker, snapshot) + if os.path.isfile(snapshot) and os.path.getsize(snapshot) > 0: + return snapshot + return live_tracker + + subdirs = list_timestamp_subdirs(work_dir) + exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None + return _tracker_path(exp_dir, train_type) + - if not dirs: - return None - latest = max(dirs, key=lambda d: os.path.getmtime(os.path.join(work_dir, d))) - return os.path.join(work_dir, latest) +def snapshot_first_run_tracker( + work_dir: str, + phase: str | None, + cur_path: str, + context: dict[str, Any] | None = None, +) -> None: + if phase != "first" or not os.path.isfile(cur_path): + return + snapshot = _snapshot_path(work_dir) + if cur_path != snapshot: + shutil.copy2(cur_path, snapshot) + if context is not None: + context["first_run_tracker"] = snapshot diff --git a/autotest/utils/check_metric.py b/autotest/utils/check_metric.py index 00814a32e..fe0a0be12 100644 --- a/autotest/utils/check_metric.py +++ b/autotest/utils/check_metric.py @@ -10,6 +10,13 @@ ) logger = logging.getLogger(__name__) +MEMORY_GRADIENT_WARMUP_STEPS = 5 +MEMORY_GRADIENT_MIN_SEGMENT_LEN = 8 +MEMORY_GRADIENT_POSITIVE_RATIO = 0.65 +MEMORY_GRADIENT_MIN_SLOPE_GB = 1e-4 +MEMORY_GRADIENT_MIN_REL_DRIFT = 0.00015 +MEMORY_GRADIENT_RESUME_DROP_GB = 0.005 + def extract_value(file, metrics): metric_all = {metric: [] for metric in metrics} @@ -25,7 +32,57 @@ def extract_value(file, metrics): return total_step, metric_all -def check_result(case_name, base_path, cur_path, check_metric): +def _split_memory_segments(values: np.ndarray) -> list[np.ndarray]: + if len(values) < MEMORY_GRADIENT_MIN_SEGMENT_LEN: + return [values] + + segments: list[np.ndarray] = [] + start = 0 + for idx in range(1, len(values)): + dropped = values[idx - 1] - values[idx] + if dropped >= MEMORY_GRADIENT_RESUME_DROP_GB: + if idx - start >= MEMORY_GRADIENT_MIN_SEGMENT_LEN: + segments.append(values[start:idx]) + start = idx + if len(values) - start >= MEMORY_GRADIENT_MIN_SEGMENT_LEN: + segments.append(values[start:]) + return segments or [values] + + +def detect_memory_upward_gradient(values: list[float]) -> tuple[bool, str]: + """Detect sustained upward memory drift (possible leak) in the current + run.""" + if len(values) <= MEMORY_GRADIENT_WARMUP_STEPS + MEMORY_GRADIENT_MIN_SEGMENT_LEN: + return False, "" + + series = np.asarray(values[MEMORY_GRADIENT_WARMUP_STEPS:], dtype=float) + + for seg_idx, segment in enumerate(_split_memory_segments(series)): + if len(segment) < MEMORY_GRADIENT_MIN_SEGMENT_LEN: + continue + + deltas = np.diff(segment) + positive_ratio = float(np.mean(deltas > 1e-4)) + x = np.arange(len(segment)) + slope, _ = np.polyfit(x, segment, 1) + mean_val = float(np.mean(segment)) + if mean_val < 1e-10: + continue + + relative_drift = float(slope * (len(segment) - 1) / mean_val) + slope_rising = slope > MEMORY_GRADIENT_MIN_SLOPE_GB + mostly_increasing = positive_ratio >= MEMORY_GRADIENT_POSITIVE_RATIO + drift_too_large = relative_drift > MEMORY_GRADIENT_MIN_REL_DRIFT + + if slope_rising and mostly_increasing and drift_too_large: + return True, ( + f"segment {seg_idx}: slope={slope:.6f} GB/step, " + f"relative_drift={relative_drift:.4f}, positive_ratio={positive_ratio:.2f}" + ) + return False, "" + + +def check_result(case_name, base_path, cur_path, check_metric, phase=None): fail_metric = {} metric_list = list(check_metric.keys()) base_steps, base_metrics = extract_value(base_path, metric_list) @@ -34,7 +91,7 @@ def check_result(case_name, base_path, cur_path, check_metric): f"current steps is not equal to base steps, current steps: {cur_steps}, base steps: {base_steps}" ) - publish_comparison_report(case_name, check_metric, base_metrics, cur_metrics, base_path, cur_path) + publish_comparison_report(case_name, check_metric, base_metrics, cur_metrics, base_path, cur_path, phase=phase) for metric, threshold in check_metric.items(): max_error = 0.0 @@ -42,13 +99,20 @@ def check_result(case_name, base_path, cur_path, check_metric): check_flag = True if metric == "runtime_info/tgs": if cur_steps > 10: - relative_errors = abs(np.array(base_metrics[metric][10:-1]) - np.array(cur_metrics[metric][10:-1])) / ( - np.array(base_metrics[metric][10:-1]) + base_vals = np.array(base_metrics[metric][10:-1], dtype=float) + cur_vals = np.array(cur_metrics[metric][10:-1], dtype=float) + degradation = np.zeros_like(base_vals, dtype=float) + valid_base = np.abs(base_vals) >= 1e-10 + degradation[valid_base] = np.maximum( + (base_vals[valid_base] - cur_vals[valid_base]) / np.abs(base_vals[valid_base]), + 0.0, ) - max_error = np.percentile(relative_errors, 80) + max_error = float(np.percentile(degradation, 80)) if max_error > threshold: fail_metric[metric] = ( - f"{metric} relative error bigger than {threshold} after 10 step, baseline: {base_metrics[metric][10:-1]}, now: {cur_metrics[metric][10:-1]}, relative error: {relative_errors}" + f"{metric} degradation bigger than {threshold} after step 10, " + f"baseline: {base_metrics[metric][10:-1]}, now: {cur_metrics[metric][10:-1]}, " + f"degradation: {degradation.tolist()}" ) check_flag = False else: @@ -56,6 +120,28 @@ def check_result(case_name, base_path, cur_path, check_metric): else: logger.warning("It's meaningless to compare tgs because of the small steps.") check_flag = False + elif metric == "memory/max_memory_GB": + for idx, (old, cur) in enumerate(zip(base_metrics[metric], cur_metrics[metric])): + if abs(old) < 1e-10: + relative_error = float("inf") if abs(cur) > 1e-10 else 0.0 + else: + relative_error = round(abs(old - cur) / abs(old), 2) + if relative_error > max_error: + max_error = relative_error + max_error_idx = idx + if relative_error > threshold: + fail_metric[metric] = ( + f"{metric} relative error bigger than {threshold} in {idx} steps, " + f"baseline: {old:.6f}, now: {cur:.6f}, relative error: {relative_error}" + ) + check_flag = False + break + + if check_flag: + has_gradient, gradient_info = detect_memory_upward_gradient(cur_metrics[metric]) + if has_gradient: + fail_metric[metric] = f"{metric} shows sustained upward gradient in current run, {gradient_info}" + check_flag = False else: for idx, (old, cur) in enumerate(zip(base_metrics[metric], cur_metrics[metric])): if abs(old) < 1e-10: @@ -82,7 +168,7 @@ def check_result(case_name, base_path, cur_path, check_metric): return result, f"Some metric check failed: {fail_metric}" -def check_rl_result(case_name, base_path, cur_path, assert_info): +def check_rl_result(case_name, base_path, cur_path, assert_info, phase=None): fail_metric = {} check_metrics_list = assert_info["check_metrics"] @@ -96,7 +182,9 @@ def check_rl_result(case_name, base_path, cur_path, assert_info): ) check_metric_dict = {item["metric"]: item["threshold"] for item in check_metrics_list} - publish_comparison_report(case_name, check_metric_dict, base_metrics, cur_metrics, base_path, cur_path) + publish_comparison_report( + case_name, check_metric_dict, base_metrics, cur_metrics, base_path, cur_path, phase=phase + ) for config in check_metrics_list: metric = config["metric"] diff --git a/autotest/utils/metric_report.py b/autotest/utils/metric_report.py index 37e790e1b..58e1ef5bd 100644 --- a/autotest/utils/metric_report.py +++ b/autotest/utils/metric_report.py @@ -21,12 +21,16 @@ def get_report_dir() -> Path: return report_dir -def report_image_url(case_name: str) -> str: +def report_suffix_from_phase(phase: str | None) -> str: + return "_resume" if phase == "resume" else "" + + +def report_image_url(case_name: str, report_suffix: str = "") -> str: run_id = os.environ.get("GITHUB_RUN_ID", "0") raw_base = os.environ.get("CI_REPORTS_RAW_URL_BASE", "").rstrip("/") or DEFAULT_RAW_URL_BASE device = os.environ.get("DEVICE", "") prefix = f"{raw_base}/npu" if device == "npu" else raw_base - return f"{prefix}/{run_id}/{case_name}_comparison.png" + return f"{prefix}/{run_id}/{case_name}{report_suffix}_comparison.png" def plot_comparison( @@ -35,6 +39,7 @@ def plot_comparison( base_metrics: dict, cur_metrics: dict, output_root: Path, + report_suffix: str = "", ) -> Path: metric_list = list(metric_keys.keys()) n_plots = len(metric_list) @@ -72,9 +77,10 @@ def plot_comparison( else: ax.axis("off") - fig.suptitle(f"{case_name}_metrics_comparison", fontsize=16) + title = f"{case_name}{report_suffix}_metrics_comparison" + fig.suptitle(title, fontsize=16) plt.tight_layout() - output_path = output_root / f"{case_name}_comparison.png" + output_path = output_root / f"{case_name}{report_suffix}_comparison.png" plt.savefig(output_path, dpi=100, bbox_inches="tight") plt.close() return output_path @@ -118,11 +124,13 @@ def format_jsonl_preview(path: str, label: str) -> str: return md -def append_case_to_step_summary(case_name: str, base_jsonl: str, cur_jsonl: str) -> None: +def append_case_to_step_summary(case_name: str, base_jsonl: str, cur_jsonl: str, report_suffix: str = "") -> None: summary_file = os.environ.get("GITHUB_STEP_SUMMARY", "./tmp.md") - image_url = report_image_url(case_name) + image_url = report_image_url(case_name, report_suffix) + phase_label = "resume" if report_suffix == "_resume" else None + title = f"{case_name} ({phase_label})" if phase_label else case_name with open(summary_file, "a", encoding="utf-8") as f: - f.write(f"## {case_name} 指标比较图\n") + f.write(f"## {title} 指标比较图\n") f.write('
\n') f.write(f'\n') @@ -146,10 +154,12 @@ def publish_comparison_report( cur_metrics: dict, base_jsonl: str, cur_jsonl: str, + phase: str | None = None, ) -> Path: """Write comparison PNG under ``{GITHUB_RUN_ID}/`` and append job summary.""" + report_suffix = report_suffix_from_phase(phase) output_root = get_report_dir() - plot_path = plot_comparison(case_name, metric_keys, base_metrics, cur_metrics, output_root) - append_case_to_step_summary(case_name, base_jsonl, cur_jsonl) + plot_path = plot_comparison(case_name, metric_keys, base_metrics, cur_metrics, output_root, report_suffix) + append_case_to_step_summary(case_name, base_jsonl, cur_jsonl, report_suffix) return plot_path From 4281e01821865a770950e08465ad35681727f31e Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 12 Jun 2026 10:20:27 +0800 Subject: [PATCH 05/11] update --- autotest/module/train.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/autotest/module/train.py b/autotest/module/train.py index ff2668e8e..1ed84bbac 100644 --- a/autotest/module/train.py +++ b/autotest/module/train.py @@ -7,7 +7,7 @@ from utils.run_cmd import run_cmd -FIRST_RUN_TRACKER_SNAPSHOT = "_first_run_tracker.jsonl" +FIRST_RUN_TRACKER_SNAPSHOT = "first_run_tracker.jsonl" class Train: @@ -83,7 +83,11 @@ def validate(config): phase = config.get("phase") context = config.get("context", {}) - cur_path = resolve_tracker_path(work_dir, train_type, phase, context=context) + run_id = config.get("run_id") + + cur_path = resolve_tracker_path( + work_dir, train_type, phase, context=context, run_id=run_id, case_name=case_name + ) if train_type == "sft": check_metrics = config.get("assert_info", {}).get("check_metrics", {}) @@ -95,7 +99,7 @@ def validate(config): print("Unknown type: {train_type}") return False - snapshot_first_run_tracker(work_dir, phase, cur_path, context=context) + snapshot_first_run_tracker(run_id, case_name, train_type, phase, cur_path, context=context) return result def pre_action(config=None): @@ -131,8 +135,9 @@ def _tracker_path(exp_dir: str | None, train_type: str) -> str: return os.path.join(exp_dir, _tracker_relpath(train_type)) -def _snapshot_path(work_dir: str) -> str: - return os.path.join(work_dir, FIRST_RUN_TRACKER_SNAPSHOT) +def _snapshot_path(run_id: str | None, case_name: str, train_type: str) -> str: + snapshot_dir = os.path.join(os.getcwd(), str(run_id or "0"), ".snapshots", case_name, train_type) + return os.path.join(snapshot_dir, FIRST_RUN_TRACKER_SNAPSHOT) def _write_first_run_segment(src: str, dst: str) -> None: @@ -163,12 +168,16 @@ def resolve_tracker_path( train_type: str, phase: str | None, context: dict[str, Any] | None = None, + run_id: str | None = None, + case_name: str | None = None, ) -> str: context = context or {} - snapshot = context.get("first_run_tracker") or _snapshot_path(work_dir) + snapshot = context.get("first_run_tracker") + if snapshot is None and case_name is not None: + snapshot = _snapshot_path(run_id, case_name, train_type) if phase == "first": - if os.path.isfile(snapshot): + if snapshot and os.path.isfile(snapshot): return snapshot subdirs = list_timestamp_subdirs(work_dir) @@ -178,7 +187,7 @@ def resolve_tracker_path( exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None live_tracker = _tracker_path(exp_dir, train_type) - if os.path.isfile(live_tracker) and _has_duplicate_steps(live_tracker): + if snapshot and os.path.isfile(live_tracker) and _has_duplicate_steps(live_tracker): _write_first_run_segment(live_tracker, snapshot) if os.path.isfile(snapshot) and os.path.getsize(snapshot) > 0: return snapshot @@ -190,14 +199,17 @@ def resolve_tracker_path( def snapshot_first_run_tracker( - work_dir: str, + run_id: str | None, + case_name: str, + train_type: str, phase: str | None, cur_path: str, context: dict[str, Any] | None = None, ) -> None: if phase != "first" or not os.path.isfile(cur_path): return - snapshot = _snapshot_path(work_dir) + snapshot = _snapshot_path(run_id, case_name, train_type) + os.makedirs(os.path.dirname(snapshot), exist_ok=True) if cur_path != snapshot: shutil.copy2(cur_path, snapshot) if context is not None: From 1d7c82929c5db98ad00a06784a3ae606885072ec Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 12 Jun 2026 17:21:27 +0800 Subject: [PATCH 06/11] updsate --- autotest/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 0e31b3bbd..ee934eea4 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -752,7 +752,7 @@ case: operator: <= - metric: response/response_len/mean - threshold: 0.12 + threshold: 0.15 method: relative operator: < - From 7f98c3621384116b4f474c2bddb43ea4ec0479ca Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Jun 2026 14:42:23 +0800 Subject: [PATCH 07/11] update --- autotest/cluster/clusterx.py | 84 +++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/autotest/cluster/clusterx.py b/autotest/cluster/clusterx.py index 688642584..e6ff04e83 100644 --- a/autotest/cluster/clusterx.py +++ b/autotest/cluster/clusterx.py @@ -1,10 +1,15 @@ +import re import time import traceback from typing import Any, Dict, Optional from clusterx.config import CLUSTER from clusterx.launcher import CLUSTER_MAPPING -from clusterx.launcher.base import JobStatus +from clusterx.launcher.base import JobSchema, JobStatus + + +JOB_LOOKUP_RETRY_INTERVAL_S = 5 +JOB_LOOKUP_RETRY_TIMES = 6 class ClusterTaskExecutor: @@ -36,9 +41,9 @@ def execute_task(self, task_config: Dict[str, Any]): all_command.append(command) run_command = "; ".join(all_command) + job_name = "-".join([task_config["type"], task_config["case_name"], task_config["run_id"]]) try: - job_name = "-".join([task_config["type"], task_config["case_name"], task_config["run_id"]]) params = self.params_cls( job_name=job_name, cmd=run_command, @@ -50,13 +55,22 @@ def execute_task(self, task_config: Dict[str, Any]): num_nodes=resource.get("num_nodes", 1), image=resource.get("image", None), no_env=resource.get("no_env", True), - image_pull_policy=resource.get("image_pull_policy","Always"), + image_pull_policy=resource.get("image_pull_policy", "Always"), ) job_schema = self.cluster.run(params) except Exception as e: traceback.print_exc() - raise RuntimeError(f"clusterx job {job_name} start fail, task config is {task_config}, exception is: {e}") + job_schema = self._lookup_job_schema(job_name) + if job_schema is None: + raise RuntimeError( + f"clusterx job {job_name} start fail and lookup found no matching job, " + f"task config is {task_config}, exception is: {e}" + ) + print( + f"clusterx job {job_name} submit error recovered via lookup: " + f"job_id={job_schema.job_id}, status={job_schema.status}, original exception: {e}" + ) start_time = time.time() run_start_time = None @@ -68,7 +82,7 @@ def execute_task(self, task_config: Dict[str, Any]): if status in [JobStatus.SUCCEEDED]: run_time = time.time() - run_start_time if run_time >= timeout: - return False, f'Task succeeded, but run time is {run_time}, exceeding then {timeout}' + return False, f"Task succeeded, but run time is {run_time}, exceeding then {timeout}" else: return True, "Task succeeded" elif status in [JobStatus.FAILED, JobStatus.STOPPED]: @@ -91,6 +105,66 @@ def execute_task(self, task_config: Dict[str, Any]): ) time.sleep(10) + @staticmethod + def _job_name_matches(candidate: str | None, job_name: str) -> bool: + if not candidate: + return False + return candidate == job_name or candidate.startswith(f"{job_name}-") + + def _pick_latest_job(self, jobs: list[JobSchema]) -> JobSchema: + return max(jobs, key=lambda job: job.job_id or job.job_name or "") + + def _lookup_job_schema_once(self, job_name: str) -> JobSchema | None: + try: + return self.cluster.get_job_info(job_name) + except Exception: + pass + + name_regex = rf"^{re.escape(job_name)}(-.*)?$" + try: + jobs = self.cluster.list_jobs(regex=name_regex, num=50) + if jobs: + return self._pick_latest_job(jobs) + except Exception as e: + print(f"list_jobs lookup for {job_name} failed: {e}") + + client = getattr(self.cluster, "client", None) + get_job_name = getattr(self.cluster, "_get_job_name", None) + if client is not None and get_job_name is not None: + try: + matched_names = [ + get_job_name(job) + for job in (client.list() or []) + if self._job_name_matches(get_job_name(job), job_name) + ] + if matched_names: + return self.cluster.get_job_info(max(matched_names)) + except Exception as e: + print(f"brainpp client list lookup for {job_name} failed: {e}") + + try: + jobs = self.cluster.list_jobs(num=100) + matched = [job for job in jobs if self._job_name_matches(job.job_id, job_name)] + if matched: + return self._pick_latest_job(matched) + except Exception as e: + print(f"generic list_jobs lookup for {job_name} failed: {e}") + + return None + + def _lookup_job_schema(self, job_name: str) -> JobSchema | None: + for attempt in range(1, JOB_LOOKUP_RETRY_TIMES + 1): + job_schema = self._lookup_job_schema_once(job_name) + if job_schema is not None: + return job_schema + if attempt < JOB_LOOKUP_RETRY_TIMES: + print( + f"Job {job_name} not found on attempt {attempt}/{JOB_LOOKUP_RETRY_TIMES}, " + f"retry in {JOB_LOOKUP_RETRY_INTERVAL_S}s" + ) + time.sleep(JOB_LOOKUP_RETRY_INTERVAL_S) + return None + def get_task_status(self, job_id: str) -> Optional[JobStatus]: try: status = self.cluster.get_job_info(job_id).status From 8ccca84a2499f93593d2973cb9803644abe31592 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 16 Jun 2026 19:19:05 +0800 Subject: [PATCH 08/11] update --- autotest/config/qwen3_fp8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/config/qwen3_fp8.py b/autotest/config/qwen3_fp8.py index 1147ae16e..132db8a38 100644 --- a/autotest/config/qwen3_fp8.py +++ b/autotest/config/qwen3_fp8.py @@ -21,7 +21,7 @@ scaling_granularity_grouped_gemm=ScalingGranularity.TILEWISE, ) -moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False) +moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False, balancing_loss_cfg=None) optim_cfg = AdamWConfig(lr=6e-05) lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6) fsdp_cfg = FSDPConfig( From 67dab857805a461d48ed127cc16dea30c5b72f14 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 17 Jun 2026 13:51:10 +0800 Subject: [PATCH 09/11] update --- autotest/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index ee934eea4..9e3b0c9eb 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -848,7 +848,7 @@ case: threshold: 20 method: absolute operator: < - timeout: 7200 + timeout: 9000 qwen3-5-rl-vl-lmdeploy-dapo: - @@ -892,7 +892,7 @@ case: threshold: 20 method: absolute operator: < - timeout: 7200 + timeout: 9000 qwen3-5-rl-vl-lmdeploy-resume: - From abe58f4294431799e3e167bddca2ee42d26d7334 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Mon, 22 Jun 2026 11:09:37 +0800 Subject: [PATCH 10/11] Update qwen3_fp8.py --- autotest/config/qwen3_fp8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/config/qwen3_fp8.py b/autotest/config/qwen3_fp8.py index 132db8a38..1147ae16e 100644 --- a/autotest/config/qwen3_fp8.py +++ b/autotest/config/qwen3_fp8.py @@ -21,7 +21,7 @@ scaling_granularity_grouped_gemm=ScalingGranularity.TILEWISE, ) -moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False, balancing_loss_cfg=None) +moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False) optim_cfg = AdamWConfig(lr=6e-05) lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6) fsdp_cfg = FSDPConfig( From cb18f3510c4f3ba62f92c36d12f7c2ad262dcb22 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Tue, 23 Jun 2026 11:35:53 +0800 Subject: [PATCH 11/11] update --- autotest/config.yaml | 40 +++------- autotest/utils/check_metric.py | 130 ++++++++++++++++++++++++++++++--- 2 files changed, 130 insertions(+), 40 deletions(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 9e3b0c9eb..fec0137cb 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -823,11 +823,6 @@ case: assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-grpo/tracker.jsonl check_metrics: - - - metric: eval/accuracy - threshold: 0.1 - method: absolute - operator: < - metric: response/rewards/mean threshold: 0.3 @@ -845,9 +840,10 @@ case: operator: < - metric: time/step - threshold: 20 - method: absolute + threshold: 0.15 + method: relative operator: < + aggregate: 80 timeout: 9000 qwen3-5-rl-vl-lmdeploy-dapo: @@ -867,11 +863,6 @@ case: assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-dapo/tracker.jsonl check_metrics: - - - metric: eval/accuracy - threshold: 0.1 - method: absolute - operator: < - metric: response/rewards/mean threshold: 0.3 @@ -889,9 +880,10 @@ case: operator: < - metric: time/step - threshold: 20 - method: absolute + threshold: 0.15 + method: relative operator: < + aggregate: 80 timeout: 9000 qwen3-5-rl-vl-lmdeploy-resume: @@ -913,11 +905,6 @@ case: assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker.jsonl check_metrics: - - - metric: eval/accuracy - threshold: 0.1 - method: absolute - operator: < - metric: response/rewards/mean threshold: 0.3 @@ -935,9 +922,10 @@ case: operator: < - metric: time/step - threshold: 20 - method: absolute + threshold: 0.15 + method: relative operator: < + aggregate: 80 timeout: 7200 - @@ -960,11 +948,6 @@ case: assert_info: base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker-resume.jsonl check_metrics: - - - metric: eval/accuracy - threshold: 0.1 - method: absolute - operator: < - metric: response/rewards/mean threshold: 0.3 @@ -982,9 +965,10 @@ case: operator: < - metric: time/step - threshold: 20 - method: absolute + threshold: 0.15 + method: relative operator: < + aggregate: 80 timeout: 7200 qwen3-5-rl-lmdeploy-dapo: diff --git a/autotest/utils/check_metric.py b/autotest/utils/check_metric.py index fe0a0be12..c38368845 100644 --- a/autotest/utils/check_metric.py +++ b/autotest/utils/check_metric.py @@ -17,6 +17,13 @@ MEMORY_GRADIENT_MIN_REL_DRIFT = 0.00015 MEMORY_GRADIENT_RESUME_DROP_GB = 0.005 +# RL tracker lines: mini-batch logs vs per-RL-step summary (see rl_trainer._log_step). +RL_STEP_SUMMARY_MARKER = "response/rewards/mean" +RL_PERCENTILE_METRICS: dict[str, int] = { + "response/response_len/mean": 80, + "response/rewards/mean": 80, +} + def extract_value(file, metrics): metric_all = {metric: [] for metric in metrics} @@ -32,6 +39,73 @@ def extract_value(file, metrics): return total_step, metric_all +def extract_rl_value(file, metrics): + """Extract metrics from RL step-summary lines only (ignore mini-batch + rows).""" + metric_all = {metric: [] for metric in metrics} + total_step = 0 + with open(file) as f: + for line in f: + record = json.loads(line) + if RL_STEP_SUMMARY_MARKER not in record: + continue + total_step += 1 + for metric in metrics: + if metric in record: + metric_all[metric].append(record[metric]) + return total_step, metric_all + + +def _step_errors(base_vals: list[float], cur_vals: list[float], method: str) -> list[float]: + errors: list[float] = [] + for base_val, cur_val in zip(base_vals, cur_vals): + if method == "absolute": + errors.append(abs(cur_val - base_val)) + elif method == "relative": + if abs(base_val) < 1e-10: + errors.append(float("inf") if abs(cur_val) > 1e-10 else 0.0) + else: + errors.append(abs(cur_val - base_val) / abs(base_val)) + else: + raise ValueError(f"Unknown method: {method}") + return errors + + +def _percentile_error_passes( + base_vals: list[float], + cur_vals: list[float], + *, + method: str, + threshold: float, + operator: str, + percentile: int, +) -> tuple[bool, float, str]: + errors = _step_errors(base_vals, cur_vals, method) + agg_error = float(np.percentile(errors, percentile)) + if operator == "<": + passed = agg_error < threshold + elif operator == "<=": + passed = agg_error <= threshold + else: + raise ValueError(f"Unknown operator: {operator}") + detail = f"p{percentile}={agg_error:.6f} (max={max(errors):.6f})" + return passed, agg_error, detail + + +def _format_rl_metric_failure( + metric: str, + *, + method: str, + operator: str, + threshold: float, + detail: str, +) -> str: + return ( + f"{metric} aggregated error does not satisfy threshold {threshold} " + f"(method: {method}, operator: {operator}, {detail})" + ) + + def _split_memory_segments(values: np.ndarray) -> list[np.ndarray]: if len(values) < MEMORY_GRADIENT_MIN_SEGMENT_LEN: return [values] @@ -174,11 +248,11 @@ def check_rl_result(case_name, base_path, cur_path, assert_info, phase=None): metric_list = [item["metric"] for item in check_metrics_list] - base_steps, base_metrics = extract_value(base_path, metric_list) - cur_steps, cur_metrics = extract_value(cur_path, metric_list) + base_steps, base_metrics = extract_rl_value(base_path, metric_list) + cur_steps, cur_metrics = extract_rl_value(cur_path, metric_list) assert cur_steps == base_steps, ( - f"current steps is not equal to base steps, current steps: {cur_steps}, base steps: {base_steps}" + f"current RL steps is not equal to base RL steps, current steps: {cur_steps}, base steps: {base_steps}" ) check_metric_dict = {item["metric"]: item["threshold"] for item in check_metrics_list} @@ -191,21 +265,53 @@ def check_rl_result(case_name, base_path, cur_path, assert_info, phase=None): threshold = config["threshold"] method = config["method"] operator = config["operator"] + percentile = config.get("aggregate") + if percentile is None and metric in RL_PERCENTILE_METRICS: + percentile = RL_PERCENTILE_METRICS[metric] + + base_vals = base_metrics[metric] + cur_vals = cur_metrics[metric] + if not base_vals and not cur_vals: + logger.warning(f"Skip {metric}: absent in both baseline and current RL step summaries.") + continue + if len(base_vals) != len(cur_vals): + fail_metric[metric] = ( + f"{metric} step count mismatch after RL step-summary extraction: " + f"baseline={len(base_vals)}, current={len(cur_vals)}" + ) + continue max_error = 0.0 max_error_idx = 0 check_flag = True - for idx, (base_val, cur_val) in enumerate(zip(base_metrics[metric], cur_metrics[metric])): - if method == "absolute": - error = round(abs(cur_val - base_val), 5) - elif method == "relative": - if abs(base_val) < 1e-10: - error = float("inf") if abs(cur_val) > 1e-10 else 0.0 - else: - error = round(abs(cur_val - base_val) / abs(base_val), 5) + if percentile is not None: + check_flag, agg_error, detail = _percentile_error_passes( + base_vals, + cur_vals, + method=method, + threshold=threshold, + operator=operator, + percentile=int(percentile), + ) + if not check_flag: + fail_metric[metric] = _format_rl_metric_failure( + metric, + method=method, + operator=operator, + threshold=threshold, + detail=detail, + ) else: - raise ValueError(f"Unknown method: {method}") + logger.info( + f"✓ {metric} check passed ({detail}, method: {method}, operator: {operator}, " + f"threshold: {threshold})" + ) + continue + + for idx, (base_val, cur_val) in enumerate(zip(base_vals, cur_vals)): + errors = _step_errors([base_val], [cur_val], method) + error = round(errors[0], 5) if error > max_error: max_error = error