From 8fbb67cf5a524b12eff4fda0cbcf0149c3ba67b1 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Thu, 11 Jun 2026 14:37:30 +0800
Subject: [PATCH 01/11] update

---
 autotest/config.yaml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index ba15121ed..6bfd70eae 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -352,6 +352,7 @@ case:
                     - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
                     - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
                     - XTUNER_DETERMINISTIC=true
+                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-sft-vl-dense/tracker.jsonl
                 check_metrics:
@@ -769,6 +770,7 @@ case:
                     - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k/test.jsonl
                     - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k
                     - XTUNER_DETERMINISTIC=true
+                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-rl-vl-lmdeploy/tracker.jsonl
                 check_metrics:
@@ -813,6 +815,7 @@ case:
                     - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
+                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-grpo/tracker.jsonl
                 check_metrics:
@@ -857,6 +860,7 @@ case:
                     - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
+                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-dapo/tracker.jsonl
                 check_metrics:
@@ -902,6 +906,7 @@ case:
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
                     - XTUNER_GC_ENABLE=1
+                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker.jsonl
                 check_metrics:
@@ -948,6 +953,7 @@ case:
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
                     - XTUNER_GC_ENABLE=1
+                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker-resume.jsonl
                 check_metrics:
@@ -993,6 +999,7 @@ case:
                     - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/test.jsonl
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
+                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-lmdeploy-dapo/tracker.jsonl
                 check_metrics:
@@ -1041,7 +1048,6 @@ case:
                     - XTUNER_USE_LMDEPLOY=0
                     - XTUNER_USE_VLLM=0
                     - XTUNER_USE_SGLANG=1
-                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
                     - XTUNER_USE_FA3=0
             assert_info:
                 base_metric: qwen3-rl-sglang/tracker.jsonl

From 40046311bd60e914af73f2ebc40274bda775f15a Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Thu, 11 Jun 2026 14:42:51 +0800
Subject: [PATCH 02/11] update

---
 .github/workflows/ete_test_gpu.yaml | 4 ++--
 .github/workflows/ete_test_npu.yaml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ete_test_gpu.yaml b/.github/workflows/ete_test_gpu.yaml
index 009b22e64..d230206eb 100644
--- a/.github/workflows/ete_test_gpu.yaml
+++ b/.github/workflows/ete_test_gpu.yaml
@@ -65,9 +65,9 @@ jobs:
           unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
           CASE_NAME="${{ github.event.inputs.run_case || 'all' }}"
           if [ "$CASE_NAME" = "all" ]; then
-            pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }}
+            pytest autotest/test_all.py -m all -n 1 -s -vv --run_id ${{ github.run_id }}
           else
-            pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -vv --run_id ${{ github.run_id }}
+            pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -s -vv --run_id ${{ github.run_id }}
           fi
 
       - name: Check report files
diff --git a/.github/workflows/ete_test_npu.yaml b/.github/workflows/ete_test_npu.yaml
index 2ca5aaff2..47c2afb6e 100644
--- a/.github/workflows/ete_test_npu.yaml
+++ b/.github/workflows/ete_test_npu.yaml
@@ -61,9 +61,9 @@ jobs:
           unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
           CASE_NAME="${{ github.event.inputs.run_case || 'all' }}"
           if [ "$CASE_NAME" = "all" ]; then
-            export DEVICE=npu && pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }}
+            export DEVICE=npu && pytest autotest/test_all.py -m all -n 1 -s -vv --run_id ${{ github.run_id }}
           else
-            export DEVICE=npu && pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -vv --run_id ${{ github.run_id }}
+            export DEVICE=npu && pytest autotest/test_all.py::test_all[$CASE_NAME] -m all -n 1 -s -vv --run_id ${{ github.run_id }}
           fi
 
       - name: Check report files

From 812be4718dd102eaa4b5c9189720790bd2efc6ee Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Thu, 11 Jun 2026 14:45:42 +0800
Subject: [PATCH 03/11] update

---
 autotest/config.yaml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index 6bfd70eae..1db4e2296 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -770,7 +770,6 @@ case:
                     - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k/test.jsonl
                     - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/less_geometry3k
                     - XTUNER_DETERMINISTIC=true
-                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-rl-vl-lmdeploy/tracker.jsonl
                 check_metrics:
@@ -815,7 +814,6 @@ case:
                     - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
-                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-grpo/tracker.jsonl
                 check_metrics:
@@ -860,7 +858,6 @@ case:
                     - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/rl_vl_meta.jsonl
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
-                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-dapo/tracker.jsonl
                 check_metrics:
@@ -906,7 +903,6 @@ case:
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
                     - XTUNER_GC_ENABLE=1
-                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker.jsonl
                 check_metrics:
@@ -953,7 +949,6 @@ case:
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
                     - XTUNER_GC_ENABLE=1
-                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker-resume.jsonl
                 check_metrics:
@@ -999,7 +994,6 @@ case:
                     - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/test.jsonl
                     - XTUNER_USE_LMDEPLOY=1
                     - XTUNER_DETERMINISTIC=true
-                    - TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=0
             assert_info:
                 base_metric: qwen3-5-rl-lmdeploy-dapo/tracker.jsonl
                 check_metrics:

From 9f4d171a6e782d04b7eedb8f8ed56a7c85af2d2e Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Thu, 11 Jun 2026 16:41:14 +0800
Subject: [PATCH 04/11] add new metric and fix repeated png

---
 autotest/config-npu.yaml        |   2 +
 autotest/config.yaml            |   8 +++
 autotest/module/train.py        | 124 +++++++++++++++++++++++++++-----
 autotest/utils/check_metric.py  | 104 ++++++++++++++++++++++++---
 autotest/utils/metric_report.py |  28 +++++---
 5 files changed, 233 insertions(+), 33 deletions(-)

diff --git a/autotest/config-npu.yaml b/autotest/config-npu.yaml
index 5661e6638..8a86d3720 100644
--- a/autotest/config-npu.yaml
+++ b/autotest/config-npu.yaml
@@ -56,6 +56,7 @@ case:
     npu-qwen3-sft-ep8:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/npu_qwen3_moe_30BA3_ep8.py
                 output_path: /mnt/hwfile/llmrazor/qa-llm-cicd/test_output
@@ -80,6 +81,7 @@ case:
             timeout: 10800
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output npu-qwen3-sft-ep8 sft'
             parameters:
diff --git a/autotest/config.yaml b/autotest/config.yaml
index 1db4e2296..0e31b3bbd 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -54,6 +54,7 @@ case:
     qwen3-sft-ep8:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/qwen3_moe_30BA3_ep8.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -79,6 +80,7 @@ case:
             timeout: 1500
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-sft-ep8 sft'
             parameters:
@@ -475,6 +477,7 @@ case:
     qwen3-5-sft-sp4-resume:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/qwen3_5_moe_30BA3_sp4.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -499,6 +502,7 @@ case:
 
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume sft'
             parameters:
@@ -608,6 +612,7 @@ case:
     qwen3-5-sft-sp4-resume-vl:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/qwen3_5_moe_30BA3_sp4_vl.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -634,6 +639,7 @@ case:
 
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume-vl sft'
             parameters:
@@ -891,6 +897,7 @@ case:
     qwen3-5-rl-vl-lmdeploy-resume:
         -
             type: rl
+            phase: first
             parameters:
                 config: autotest/config/rl_qwen3p5_vl_35B_dapo_ep2_resume.py
                 infer_backend: lmdeploy
@@ -935,6 +942,7 @@ case:
 
         -
             type: rl
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-rl-vl-lmdeploy-resume rl'
             parameters:
diff --git a/autotest/module/train.py b/autotest/module/train.py
index 861221513..ff2668e8e 100644
--- a/autotest/module/train.py
+++ b/autotest/module/train.py
@@ -1,16 +1,22 @@
+import json
 import os
+import shutil
+from typing import Any
 
 from utils.check_metric import check_result, check_rl_result
 from utils.run_cmd import run_cmd
 
 
+FIRST_RUN_TRACKER_SNAPSHOT = "_first_run_tracker.jsonl"
+
+
 class Train:
     def get_cmd(config):
         print(config)
         config_path = config.get("parameters").get("config")
         train_type = config.get("type")
         nproc_per_node = config.get("resource", {}).get("gpus_per_task", 8)
-        pip_package = config.get("resource", {}).get("pip_package", 'ls')
+        pip_package = config.get("resource", {}).get("pip_package", "ls")
         if train_type in ["sft", "rl"]:
             model_config = config.get("parameters", {}).get("model", None)
             config_path = config.get("parameters", {}).get("config", None)
@@ -70,22 +76,28 @@ def get_cmd(config):
 
     def validate(config):
         work_dir = config.get("work_dir", None)
-        base_path = os.path.join(
-            config.get("base_path").get("base_baseline_path"), config.get("assert_info", {}).get("base_metric", None)
-        )
+        base_metric = config.get("assert_info", {}).get("base_metric", None)
+        base_path = os.path.join(config.get("base_path").get("base_baseline_path"), base_metric)
         train_type = config.get("type")
+        case_name = config["case_name"]
+        phase = config.get("phase")
+        context = config.get("context", {})
+
+        cur_path = resolve_tracker_path(work_dir, train_type, phase, context=context)
+
         if train_type == "sft":
-            cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/rank0/tracker.jsonl")
             check_metrics = config.get("assert_info", {}).get("check_metrics", {})
-            return check_result(config["case_name"], base_path, cur_path, check_metrics)
+            result = check_result(case_name, base_path, cur_path, check_metrics, phase=phase)
         elif train_type == "rl":
-            cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/tracker.jsonl")
             check_metrics = config.get("assert_info", {})
-            return check_rl_result(config["case_name"], base_path, cur_path, check_metrics)
+            result = check_rl_result(case_name, base_path, cur_path, check_metrics, phase=phase)
         else:
             print("Unknown type: {train_type}")
             return False
 
+        snapshot_first_run_tracker(work_dir, phase, cur_path, context=context)
+        return result
+
     def pre_action(config=None):
         action_info = config.get("pre_action", None)
         if action_info:
@@ -101,12 +113,92 @@ def post_action(config=None):
                 run_cmd(action_cmd)
 
 
-def get_latest_subdir(work_dir):
-    dirs = [
-        d for d in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, d)) and len(d) == 14 and d.isdigit()
-    ]
+def list_timestamp_subdirs(work_dir: str) -> list[str]:
+    return sorted(
+        name
+        for name in os.listdir(work_dir)
+        if os.path.isdir(os.path.join(work_dir, name)) and len(name) == 14 and name.isdigit()
+    )
+
+
+def _tracker_relpath(train_type: str) -> str:
+    if train_type == "sft":
+        return "logs/exp_tracking/rank0/tracker.jsonl"
+    return "logs/exp_tracking/tracker.jsonl"
+
+
+def _tracker_path(exp_dir: str | None, train_type: str) -> str:
+    return os.path.join(exp_dir, _tracker_relpath(train_type))
+
+
+def _snapshot_path(work_dir: str) -> str:
+    return os.path.join(work_dir, FIRST_RUN_TRACKER_SNAPSHOT)
+
+
+def _write_first_run_segment(src: str, dst: str) -> None:
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+    seen_steps: set[Any] = set()
+    with open(src, encoding="utf-8") as fin, open(dst, "w", encoding="utf-8") as fout:
+        for line in fin:
+            if not line.strip():
+                continue
+            step = json.loads(line).get("step")
+            if step in seen_steps:
+                break
+            seen_steps.add(step)
+            fout.write(line if line.endswith("\n") else f"{line}\n")
+
+
+def _has_duplicate_steps(tracker_path: str) -> bool:
+    steps: list[Any] = []
+    with open(tracker_path, encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                steps.append(json.loads(line).get("step"))
+    return len(steps) != len(set(steps))
+
+
+def resolve_tracker_path(
+    work_dir: str,
+    train_type: str,
+    phase: str | None,
+    context: dict[str, Any] | None = None,
+) -> str:
+    context = context or {}
+    snapshot = context.get("first_run_tracker") or _snapshot_path(work_dir)
+
+    if phase == "first":
+        if os.path.isfile(snapshot):
+            return snapshot
+
+        subdirs = list_timestamp_subdirs(work_dir)
+        if len(subdirs) > 1:
+            exp_dir = os.path.join(work_dir, subdirs[0])
+        else:
+            exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None
+        live_tracker = _tracker_path(exp_dir, train_type)
+
+        if os.path.isfile(live_tracker) and _has_duplicate_steps(live_tracker):
+            _write_first_run_segment(live_tracker, snapshot)
+            if os.path.isfile(snapshot) and os.path.getsize(snapshot) > 0:
+                return snapshot
+        return live_tracker
+
+    subdirs = list_timestamp_subdirs(work_dir)
+    exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None
+    return _tracker_path(exp_dir, train_type)
+
 
-    if not dirs:
-        return None
-    latest = max(dirs, key=lambda d: os.path.getmtime(os.path.join(work_dir, d)))
-    return os.path.join(work_dir, latest)
+def snapshot_first_run_tracker(
+    work_dir: str,
+    phase: str | None,
+    cur_path: str,
+    context: dict[str, Any] | None = None,
+) -> None:
+    if phase != "first" or not os.path.isfile(cur_path):
+        return
+    snapshot = _snapshot_path(work_dir)
+    if cur_path != snapshot:
+        shutil.copy2(cur_path, snapshot)
+    if context is not None:
+        context["first_run_tracker"] = snapshot
diff --git a/autotest/utils/check_metric.py b/autotest/utils/check_metric.py
index 00814a32e..fe0a0be12 100644
--- a/autotest/utils/check_metric.py
+++ b/autotest/utils/check_metric.py
@@ -10,6 +10,13 @@
 )
 logger = logging.getLogger(__name__)
 
+MEMORY_GRADIENT_WARMUP_STEPS = 5
+MEMORY_GRADIENT_MIN_SEGMENT_LEN = 8
+MEMORY_GRADIENT_POSITIVE_RATIO = 0.65
+MEMORY_GRADIENT_MIN_SLOPE_GB = 1e-4
+MEMORY_GRADIENT_MIN_REL_DRIFT = 0.00015
+MEMORY_GRADIENT_RESUME_DROP_GB = 0.005
+
 
 def extract_value(file, metrics):
     metric_all = {metric: [] for metric in metrics}
@@ -25,7 +32,57 @@ def extract_value(file, metrics):
     return total_step, metric_all
 
 
-def check_result(case_name, base_path, cur_path, check_metric):
+def _split_memory_segments(values: np.ndarray) -> list[np.ndarray]:
+    if len(values) < MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+        return [values]
+
+    segments: list[np.ndarray] = []
+    start = 0
+    for idx in range(1, len(values)):
+        dropped = values[idx - 1] - values[idx]
+        if dropped >= MEMORY_GRADIENT_RESUME_DROP_GB:
+            if idx - start >= MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+                segments.append(values[start:idx])
+            start = idx
+    if len(values) - start >= MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+        segments.append(values[start:])
+    return segments or [values]
+
+
+def detect_memory_upward_gradient(values: list[float]) -> tuple[bool, str]:
+    """Detect sustained upward memory drift (possible leak) in the current
+    run."""
+    if len(values) <= MEMORY_GRADIENT_WARMUP_STEPS + MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+        return False, ""
+
+    series = np.asarray(values[MEMORY_GRADIENT_WARMUP_STEPS:], dtype=float)
+
+    for seg_idx, segment in enumerate(_split_memory_segments(series)):
+        if len(segment) < MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+            continue
+
+        deltas = np.diff(segment)
+        positive_ratio = float(np.mean(deltas > 1e-4))
+        x = np.arange(len(segment))
+        slope, _ = np.polyfit(x, segment, 1)
+        mean_val = float(np.mean(segment))
+        if mean_val < 1e-10:
+            continue
+
+        relative_drift = float(slope * (len(segment) - 1) / mean_val)
+        slope_rising = slope > MEMORY_GRADIENT_MIN_SLOPE_GB
+        mostly_increasing = positive_ratio >= MEMORY_GRADIENT_POSITIVE_RATIO
+        drift_too_large = relative_drift > MEMORY_GRADIENT_MIN_REL_DRIFT
+
+        if slope_rising and mostly_increasing and drift_too_large:
+            return True, (
+                f"segment {seg_idx}: slope={slope:.6f} GB/step, "
+                f"relative_drift={relative_drift:.4f}, positive_ratio={positive_ratio:.2f}"
+            )
+    return False, ""
+
+
+def check_result(case_name, base_path, cur_path, check_metric, phase=None):
     fail_metric = {}
     metric_list = list(check_metric.keys())
     base_steps, base_metrics = extract_value(base_path, metric_list)
@@ -34,7 +91,7 @@ def check_result(case_name, base_path, cur_path, check_metric):
         f"current steps is not equal to base steps, current steps: {cur_steps}, base steps: {base_steps}"
     )
 
-    publish_comparison_report(case_name, check_metric, base_metrics, cur_metrics, base_path, cur_path)
+    publish_comparison_report(case_name, check_metric, base_metrics, cur_metrics, base_path, cur_path, phase=phase)
 
     for metric, threshold in check_metric.items():
         max_error = 0.0
@@ -42,13 +99,20 @@ def check_result(case_name, base_path, cur_path, check_metric):
         check_flag = True
         if metric == "runtime_info/tgs":
             if cur_steps > 10:
-                relative_errors = abs(np.array(base_metrics[metric][10:-1]) - np.array(cur_metrics[metric][10:-1])) / (
-                    np.array(base_metrics[metric][10:-1])
+                base_vals = np.array(base_metrics[metric][10:-1], dtype=float)
+                cur_vals = np.array(cur_metrics[metric][10:-1], dtype=float)
+                degradation = np.zeros_like(base_vals, dtype=float)
+                valid_base = np.abs(base_vals) >= 1e-10
+                degradation[valid_base] = np.maximum(
+                    (base_vals[valid_base] - cur_vals[valid_base]) / np.abs(base_vals[valid_base]),
+                    0.0,
                 )
-                max_error = np.percentile(relative_errors, 80)
+                max_error = float(np.percentile(degradation, 80))
                 if max_error > threshold:
                     fail_metric[metric] = (
-                        f"{metric} relative error bigger than {threshold} after 10 step, baseline: {base_metrics[metric][10:-1]}, now: {cur_metrics[metric][10:-1]}, relative error: {relative_errors}"
+                        f"{metric} degradation bigger than {threshold} after step 10, "
+                        f"baseline: {base_metrics[metric][10:-1]}, now: {cur_metrics[metric][10:-1]}, "
+                        f"degradation: {degradation.tolist()}"
                     )
                     check_flag = False
                 else:
@@ -56,6 +120,28 @@ def check_result(case_name, base_path, cur_path, check_metric):
             else:
                 logger.warning("It's meaningless to compare tgs because of the small steps.")
                 check_flag = False
+        elif metric == "memory/max_memory_GB":
+            for idx, (old, cur) in enumerate(zip(base_metrics[metric], cur_metrics[metric])):
+                if abs(old) < 1e-10:
+                    relative_error = float("inf") if abs(cur) > 1e-10 else 0.0
+                else:
+                    relative_error = round(abs(old - cur) / abs(old), 2)
+                if relative_error > max_error:
+                    max_error = relative_error
+                    max_error_idx = idx
+                if relative_error > threshold:
+                    fail_metric[metric] = (
+                        f"{metric} relative error bigger than {threshold} in {idx} steps, "
+                        f"baseline: {old:.6f}, now: {cur:.6f}, relative error: {relative_error}"
+                    )
+                    check_flag = False
+                    break
+
+            if check_flag:
+                has_gradient, gradient_info = detect_memory_upward_gradient(cur_metrics[metric])
+                if has_gradient:
+                    fail_metric[metric] = f"{metric} shows sustained upward gradient in current run, {gradient_info}"
+                    check_flag = False
         else:
             for idx, (old, cur) in enumerate(zip(base_metrics[metric], cur_metrics[metric])):
                 if abs(old) < 1e-10:
@@ -82,7 +168,7 @@ def check_result(case_name, base_path, cur_path, check_metric):
     return result, f"Some metric check failed: {fail_metric}"
 
 
-def check_rl_result(case_name, base_path, cur_path, assert_info):
+def check_rl_result(case_name, base_path, cur_path, assert_info, phase=None):
     fail_metric = {}
     check_metrics_list = assert_info["check_metrics"]
 
@@ -96,7 +182,9 @@ def check_rl_result(case_name, base_path, cur_path, assert_info):
     )
 
     check_metric_dict = {item["metric"]: item["threshold"] for item in check_metrics_list}
-    publish_comparison_report(case_name, check_metric_dict, base_metrics, cur_metrics, base_path, cur_path)
+    publish_comparison_report(
+        case_name, check_metric_dict, base_metrics, cur_metrics, base_path, cur_path, phase=phase
+    )
 
     for config in check_metrics_list:
         metric = config["metric"]
diff --git a/autotest/utils/metric_report.py b/autotest/utils/metric_report.py
index 37e790e1b..58e1ef5bd 100644
--- a/autotest/utils/metric_report.py
+++ b/autotest/utils/metric_report.py
@@ -21,12 +21,16 @@ def get_report_dir() -> Path:
     return report_dir
 
 
-def report_image_url(case_name: str) -> str:
+def report_suffix_from_phase(phase: str | None) -> str:
+    return "_resume" if phase == "resume" else ""
+
+
+def report_image_url(case_name: str, report_suffix: str = "") -> str:
     run_id = os.environ.get("GITHUB_RUN_ID", "0")
     raw_base = os.environ.get("CI_REPORTS_RAW_URL_BASE", "").rstrip("/") or DEFAULT_RAW_URL_BASE
     device = os.environ.get("DEVICE", "")
     prefix = f"{raw_base}/npu" if device == "npu" else raw_base
-    return f"{prefix}/{run_id}/{case_name}_comparison.png"
+    return f"{prefix}/{run_id}/{case_name}{report_suffix}_comparison.png"
 
 
 def plot_comparison(
@@ -35,6 +39,7 @@ def plot_comparison(
     base_metrics: dict,
     cur_metrics: dict,
     output_root: Path,
+    report_suffix: str = "",
 ) -> Path:
     metric_list = list(metric_keys.keys())
     n_plots = len(metric_list)
@@ -72,9 +77,10 @@ def plot_comparison(
         else:
             ax.axis("off")
 
-    fig.suptitle(f"{case_name}_metrics_comparison", fontsize=16)
+    title = f"{case_name}{report_suffix}_metrics_comparison"
+    fig.suptitle(title, fontsize=16)
     plt.tight_layout()
-    output_path = output_root / f"{case_name}_comparison.png"
+    output_path = output_root / f"{case_name}{report_suffix}_comparison.png"
     plt.savefig(output_path, dpi=100, bbox_inches="tight")
     plt.close()
     return output_path
@@ -118,11 +124,13 @@ def format_jsonl_preview(path: str, label: str) -> str:
     return md
 
 
-def append_case_to_step_summary(case_name: str, base_jsonl: str, cur_jsonl: str) -> None:
+def append_case_to_step_summary(case_name: str, base_jsonl: str, cur_jsonl: str, report_suffix: str = "") -> None:
     summary_file = os.environ.get("GITHUB_STEP_SUMMARY", "./tmp.md")
-    image_url = report_image_url(case_name)
+    image_url = report_image_url(case_name, report_suffix)
+    phase_label = "resume" if report_suffix == "_resume" else None
+    title = f"{case_name} ({phase_label})" if phase_label else case_name
     with open(summary_file, "a", encoding="utf-8") as f:
-        f.write(f"## {case_name} 指标比较图\n")
+        f.write(f"## {title} 指标比较图\n")
         f.write('<div align="center">\n')
         f.write(f'<img src="{image_url}"\n')
         f.write('  style="max-width: 90%; border: 1px solid #ddd; border-radius: 8px;">\n')
@@ -146,10 +154,12 @@ def publish_comparison_report(
     cur_metrics: dict,
     base_jsonl: str,
     cur_jsonl: str,
+    phase: str | None = None,
 ) -> Path:
     """Write comparison PNG under ``{GITHUB_RUN_ID}/`` and append job
     summary."""
+    report_suffix = report_suffix_from_phase(phase)
     output_root = get_report_dir()
-    plot_path = plot_comparison(case_name, metric_keys, base_metrics, cur_metrics, output_root)
-    append_case_to_step_summary(case_name, base_jsonl, cur_jsonl)
+    plot_path = plot_comparison(case_name, metric_keys, base_metrics, cur_metrics, output_root, report_suffix)
+    append_case_to_step_summary(case_name, base_jsonl, cur_jsonl, report_suffix)
     return plot_path

From 4281e01821865a770950e08465ad35681727f31e Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Fri, 12 Jun 2026 10:20:27 +0800
Subject: [PATCH 05/11] update

---
 autotest/module/train.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/autotest/module/train.py b/autotest/module/train.py
index ff2668e8e..1ed84bbac 100644
--- a/autotest/module/train.py
+++ b/autotest/module/train.py
@@ -7,7 +7,7 @@
 from utils.run_cmd import run_cmd
 
 
-FIRST_RUN_TRACKER_SNAPSHOT = "_first_run_tracker.jsonl"
+FIRST_RUN_TRACKER_SNAPSHOT = "first_run_tracker.jsonl"
 
 
 class Train:
@@ -83,7 +83,11 @@ def validate(config):
         phase = config.get("phase")
         context = config.get("context", {})
 
-        cur_path = resolve_tracker_path(work_dir, train_type, phase, context=context)
+        run_id = config.get("run_id")
+
+        cur_path = resolve_tracker_path(
+            work_dir, train_type, phase, context=context, run_id=run_id, case_name=case_name
+        )
 
         if train_type == "sft":
             check_metrics = config.get("assert_info", {}).get("check_metrics", {})
@@ -95,7 +99,7 @@ def validate(config):
             print("Unknown type: {train_type}")
             return False
 
-        snapshot_first_run_tracker(work_dir, phase, cur_path, context=context)
+        snapshot_first_run_tracker(run_id, case_name, train_type, phase, cur_path, context=context)
         return result
 
     def pre_action(config=None):
@@ -131,8 +135,9 @@ def _tracker_path(exp_dir: str | None, train_type: str) -> str:
     return os.path.join(exp_dir, _tracker_relpath(train_type))
 
 
-def _snapshot_path(work_dir: str) -> str:
-    return os.path.join(work_dir, FIRST_RUN_TRACKER_SNAPSHOT)
+def _snapshot_path(run_id: str | None, case_name: str, train_type: str) -> str:
+    snapshot_dir = os.path.join(os.getcwd(), str(run_id or "0"), ".snapshots", case_name, train_type)
+    return os.path.join(snapshot_dir, FIRST_RUN_TRACKER_SNAPSHOT)
 
 
 def _write_first_run_segment(src: str, dst: str) -> None:
@@ -163,12 +168,16 @@ def resolve_tracker_path(
     train_type: str,
     phase: str | None,
     context: dict[str, Any] | None = None,
+    run_id: str | None = None,
+    case_name: str | None = None,
 ) -> str:
     context = context or {}
-    snapshot = context.get("first_run_tracker") or _snapshot_path(work_dir)
+    snapshot = context.get("first_run_tracker")
+    if snapshot is None and case_name is not None:
+        snapshot = _snapshot_path(run_id, case_name, train_type)
 
     if phase == "first":
-        if os.path.isfile(snapshot):
+        if snapshot and os.path.isfile(snapshot):
             return snapshot
 
         subdirs = list_timestamp_subdirs(work_dir)
@@ -178,7 +187,7 @@ def resolve_tracker_path(
             exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None
         live_tracker = _tracker_path(exp_dir, train_type)
 
-        if os.path.isfile(live_tracker) and _has_duplicate_steps(live_tracker):
+        if snapshot and os.path.isfile(live_tracker) and _has_duplicate_steps(live_tracker):
             _write_first_run_segment(live_tracker, snapshot)
             if os.path.isfile(snapshot) and os.path.getsize(snapshot) > 0:
                 return snapshot
@@ -190,14 +199,17 @@ def resolve_tracker_path(
 
 
 def snapshot_first_run_tracker(
-    work_dir: str,
+    run_id: str | None,
+    case_name: str,
+    train_type: str,
     phase: str | None,
     cur_path: str,
     context: dict[str, Any] | None = None,
 ) -> None:
     if phase != "first" or not os.path.isfile(cur_path):
         return
-    snapshot = _snapshot_path(work_dir)
+    snapshot = _snapshot_path(run_id, case_name, train_type)
+    os.makedirs(os.path.dirname(snapshot), exist_ok=True)
     if cur_path != snapshot:
         shutil.copy2(cur_path, snapshot)
     if context is not None:

From 1d7c82929c5db98ad00a06784a3ae606885072ec Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Fri, 12 Jun 2026 17:21:27 +0800
Subject: [PATCH 06/11] update

---
 autotest/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index 0e31b3bbd..ee934eea4 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -752,7 +752,7 @@ case:
                        operator: <=
                     -
                        metric: response/response_len/mean
-                       threshold: 0.12
+                       threshold: 0.15
                        method: relative
                        operator: <
                     -

From 7f98c3621384116b4f474c2bddb43ea4ec0479ca Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Tue, 16 Jun 2026 14:42:23 +0800
Subject: [PATCH 07/11] update

---
 autotest/cluster/clusterx.py | 84 +++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 5 deletions(-)

diff --git a/autotest/cluster/clusterx.py b/autotest/cluster/clusterx.py
index 688642584..e6ff04e83 100644
--- a/autotest/cluster/clusterx.py
+++ b/autotest/cluster/clusterx.py
@@ -1,10 +1,15 @@
+import re
 import time
 import traceback
 from typing import Any, Dict, Optional
 
 from clusterx.config import CLUSTER
 from clusterx.launcher import CLUSTER_MAPPING
-from clusterx.launcher.base import JobStatus
+from clusterx.launcher.base import JobSchema, JobStatus
+
+
+JOB_LOOKUP_RETRY_INTERVAL_S = 5
+JOB_LOOKUP_RETRY_TIMES = 6
 
 
 class ClusterTaskExecutor:
@@ -36,9 +41,9 @@ def execute_task(self, task_config: Dict[str, Any]):
 
         all_command.append(command)
         run_command = "; ".join(all_command)
+        job_name = "-".join([task_config["type"], task_config["case_name"], task_config["run_id"]])
 
         try:
-            job_name = "-".join([task_config["type"], task_config["case_name"], task_config["run_id"]])
             params = self.params_cls(
                 job_name=job_name,
                 cmd=run_command,
@@ -50,13 +55,22 @@ def execute_task(self, task_config: Dict[str, Any]):
                 num_nodes=resource.get("num_nodes", 1),
                 image=resource.get("image", None),
                 no_env=resource.get("no_env", True),
-                image_pull_policy=resource.get("image_pull_policy","Always"),
+                image_pull_policy=resource.get("image_pull_policy", "Always"),
             )
 
             job_schema = self.cluster.run(params)
         except Exception as e:
             traceback.print_exc()
-            raise RuntimeError(f"clusterx job {job_name} start fail, task config is {task_config}, exception is: {e}")
+            job_schema = self._lookup_job_schema(job_name)
+            if job_schema is None:
+                raise RuntimeError(
+                    f"clusterx job {job_name} start fail and lookup found no matching job, "
+                    f"task config is {task_config}, exception is: {e}"
+                )
+            print(
+                f"clusterx job {job_name} submit error recovered via lookup: "
+                f"job_id={job_schema.job_id}, status={job_schema.status}, original exception: {e}"
+            )
 
         start_time = time.time()
         run_start_time = None
@@ -68,7 +82,7 @@ def execute_task(self, task_config: Dict[str, Any]):
             if status in [JobStatus.SUCCEEDED]:
                 run_time = time.time() - run_start_time
                 if run_time >= timeout:
-                    return False, f'Task succeeded, but run time is {run_time}, exceeding then {timeout}'
+                    return False, f"Task succeeded, but run time is {run_time}, exceeding then {timeout}"
                 else:
                     return True, "Task succeeded"
             elif status in [JobStatus.FAILED, JobStatus.STOPPED]:
@@ -91,6 +105,66 @@ def execute_task(self, task_config: Dict[str, Any]):
                 )
             time.sleep(10)
 
+    @staticmethod
+    def _job_name_matches(candidate: str | None, job_name: str) -> bool:
+        if not candidate:
+            return False
+        return candidate == job_name or candidate.startswith(f"{job_name}-")
+
+    def _pick_latest_job(self, jobs: list[JobSchema]) -> JobSchema:
+        return max(jobs, key=lambda job: job.job_id or job.job_name or "")
+
+    def _lookup_job_schema_once(self, job_name: str) -> JobSchema | None:
+        try:
+            return self.cluster.get_job_info(job_name)
+        except Exception:
+            pass
+
+        name_regex = rf"^{re.escape(job_name)}(-.*)?$"
+        try:
+            jobs = self.cluster.list_jobs(regex=name_regex, num=50)
+            if jobs:
+                return self._pick_latest_job(jobs)
+        except Exception as e:
+            print(f"list_jobs lookup for {job_name} failed: {e}")
+
+        client = getattr(self.cluster, "client", None)
+        get_job_name = getattr(self.cluster, "_get_job_name", None)
+        if client is not None and get_job_name is not None:
+            try:
+                matched_names = [
+                    get_job_name(job)
+                    for job in (client.list() or [])
+                    if self._job_name_matches(get_job_name(job), job_name)
+                ]
+                if matched_names:
+                    return self.cluster.get_job_info(max(matched_names))
+            except Exception as e:
+                print(f"brainpp client list lookup for {job_name} failed: {e}")
+
+        try:
+            jobs = self.cluster.list_jobs(num=100)
+            matched = [job for job in jobs if self._job_name_matches(job.job_id, job_name)]
+            if matched:
+                return self._pick_latest_job(matched)
+        except Exception as e:
+            print(f"generic list_jobs lookup for {job_name} failed: {e}")
+
+        return None
+
+    def _lookup_job_schema(self, job_name: str) -> JobSchema | None:
+        for attempt in range(1, JOB_LOOKUP_RETRY_TIMES + 1):
+            job_schema = self._lookup_job_schema_once(job_name)
+            if job_schema is not None:
+                return job_schema
+            if attempt < JOB_LOOKUP_RETRY_TIMES:
+                print(
+                    f"Job {job_name} not found on attempt {attempt}/{JOB_LOOKUP_RETRY_TIMES}, "
+                    f"retry in {JOB_LOOKUP_RETRY_INTERVAL_S}s"
+                )
+                time.sleep(JOB_LOOKUP_RETRY_INTERVAL_S)
+        return None
+
     def get_task_status(self, job_id: str) -> Optional[JobStatus]:
         try:
             status = self.cluster.get_job_info(job_id).status

From 8ccca84a2499f93593d2973cb9803644abe31592 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Tue, 16 Jun 2026 19:19:05 +0800
Subject: [PATCH 08/11] update

---
 autotest/config/qwen3_fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotest/config/qwen3_fp8.py b/autotest/config/qwen3_fp8.py
index 1147ae16e..132db8a38 100644
--- a/autotest/config/qwen3_fp8.py
+++ b/autotest/config/qwen3_fp8.py
@@ -21,7 +21,7 @@
     scaling_granularity_grouped_gemm=ScalingGranularity.TILEWISE,
 )
 
-moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False)
+moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False, balancing_loss_cfg=None)
 optim_cfg = AdamWConfig(lr=6e-05)
 lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
 fsdp_cfg = FSDPConfig(

From 67dab857805a461d48ed127cc16dea30c5b72f14 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Wed, 17 Jun 2026 13:51:10 +0800
Subject: [PATCH 09/11] update

---
 autotest/config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index ee934eea4..9e3b0c9eb 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -848,7 +848,7 @@ case:
                        threshold: 20
                        method: absolute
                        operator: <
-            timeout: 7200
+            timeout: 9000
 
     qwen3-5-rl-vl-lmdeploy-dapo:
         -
@@ -892,7 +892,7 @@ case:
                        threshold: 20
                        method: absolute
                        operator: <
-            timeout: 7200
+            timeout: 9000
 
     qwen3-5-rl-vl-lmdeploy-resume:
         -

From abe58f4294431799e3e167bddca2ee42d26d7334 Mon Sep 17 00:00:00 2001
From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com>
Date: Mon, 22 Jun 2026 11:09:37 +0800
Subject: [PATCH 10/11] Update qwen3_fp8.py

---
 autotest/config/qwen3_fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotest/config/qwen3_fp8.py b/autotest/config/qwen3_fp8.py
index 132db8a38..1147ae16e 100644
--- a/autotest/config/qwen3_fp8.py
+++ b/autotest/config/qwen3_fp8.py
@@ -21,7 +21,7 @@
     scaling_granularity_grouped_gemm=ScalingGranularity.TILEWISE,
 )
 
-moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False, balancing_loss_cfg=None)
+moe_cfg = Qwen3MoE30BA3Config(float8_cfg=float8_cfg, ep_size=8, compile_cfg=False)
 optim_cfg = AdamWConfig(lr=6e-05)
 lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
 fsdp_cfg = FSDPConfig(

From cb18f3510c4f3ba62f92c36d12f7c2ad262dcb22 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulinJulia24@163.com>
Date: Tue, 23 Jun 2026 11:35:53 +0800
Subject: [PATCH 11/11] update

---
 autotest/config.yaml           |  40 +++-------
 autotest/utils/check_metric.py | 130 ++++++++++++++++++++++++++++++---
 2 files changed, 130 insertions(+), 40 deletions(-)

diff --git a/autotest/config.yaml b/autotest/config.yaml
index 9e3b0c9eb..fec0137cb 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -823,11 +823,6 @@ case:
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-grpo/tracker.jsonl
                 check_metrics:
-                    - 
-                       metric:  eval/accuracy
-                       threshold: 0.1
-                       method: absolute
-                       operator: <
                     -
                        metric: response/rewards/mean
                        threshold: 0.3
@@ -845,9 +840,10 @@ case:
                        operator: <
                     -
                        metric: time/step
-                       threshold: 20
-                       method: absolute
+                       threshold: 0.15
+                       method: relative
                        operator: <
+                       aggregate: 80
             timeout: 9000
 
     qwen3-5-rl-vl-lmdeploy-dapo:
@@ -867,11 +863,6 @@ case:
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-dapo/tracker.jsonl
                 check_metrics:
-                    - 
-                       metric:  eval/accuracy
-                       threshold: 0.1
-                       method: absolute
-                       operator: <
                     -
                        metric: response/rewards/mean
                        threshold: 0.3
@@ -889,9 +880,10 @@ case:
                        operator: <
                     -
                        metric: time/step
-                       threshold: 20
-                       method: absolute
+                       threshold: 0.15
+                       method: relative
                        operator: <
+                       aggregate: 80
             timeout: 9000
 
     qwen3-5-rl-vl-lmdeploy-resume:
@@ -913,11 +905,6 @@ case:
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker.jsonl
                 check_metrics:
-                    - 
-                       metric:  eval/accuracy
-                       threshold: 0.1
-                       method: absolute
-                       operator: <
                     -
                        metric: response/rewards/mean
                        threshold: 0.3
@@ -935,9 +922,10 @@ case:
                        operator: <
                     -
                        metric: time/step
-                       threshold: 20
-                       method: absolute
+                       threshold: 0.15
+                       method: relative
                        operator: <
+                       aggregate: 80
             timeout: 7200
 
         -
@@ -960,11 +948,6 @@ case:
             assert_info:
                 base_metric: qwen3-5-rl-vl-lmdeploy-resume/tracker-resume.jsonl
                 check_metrics:
-                    - 
-                       metric:  eval/accuracy
-                       threshold: 0.1
-                       method: absolute
-                       operator: <
                     -
                        metric: response/rewards/mean
                        threshold: 0.3
@@ -982,9 +965,10 @@ case:
                        operator: <
                     -
                        metric: time/step
-                       threshold: 20
-                       method: absolute
+                       threshold: 0.15
+                       method: relative
                        operator: <
+                       aggregate: 80
             timeout: 7200
 
     qwen3-5-rl-lmdeploy-dapo:
diff --git a/autotest/utils/check_metric.py b/autotest/utils/check_metric.py
index fe0a0be12..c38368845 100644
--- a/autotest/utils/check_metric.py
+++ b/autotest/utils/check_metric.py
@@ -17,6 +17,13 @@
 MEMORY_GRADIENT_MIN_REL_DRIFT = 0.00015
 MEMORY_GRADIENT_RESUME_DROP_GB = 0.005
 
+# RL tracker lines: mini-batch logs vs per-RL-step summary (see rl_trainer._log_step).
+RL_STEP_SUMMARY_MARKER = "response/rewards/mean"
+RL_PERCENTILE_METRICS: dict[str, int] = {
+    "response/response_len/mean": 80,
+    "response/rewards/mean": 80,
+}
+
 
 def extract_value(file, metrics):
     metric_all = {metric: [] for metric in metrics}
@@ -32,6 +39,73 @@ def extract_value(file, metrics):
     return total_step, metric_all
 
 
+def extract_rl_value(file, metrics):
+    """Extract metrics from RL step-summary lines only (ignore mini-batch
+    rows)."""
+    metric_all = {metric: [] for metric in metrics}
+    total_step = 0
+    with open(file) as f:
+        for line in f:
+            record = json.loads(line)
+            if RL_STEP_SUMMARY_MARKER not in record:
+                continue
+            total_step += 1
+            for metric in metrics:
+                if metric in record:
+                    metric_all[metric].append(record[metric])
+    return total_step, metric_all
+
+
+def _step_errors(base_vals: list[float], cur_vals: list[float], method: str) -> list[float]:
+    errors: list[float] = []
+    for base_val, cur_val in zip(base_vals, cur_vals):
+        if method == "absolute":
+            errors.append(abs(cur_val - base_val))
+        elif method == "relative":
+            if abs(base_val) < 1e-10:
+                errors.append(float("inf") if abs(cur_val) > 1e-10 else 0.0)
+            else:
+                errors.append(abs(cur_val - base_val) / abs(base_val))
+        else:
+            raise ValueError(f"Unknown method: {method}")
+    return errors
+
+
+def _percentile_error_passes(
+    base_vals: list[float],
+    cur_vals: list[float],
+    *,
+    method: str,
+    threshold: float,
+    operator: str,
+    percentile: int,
+) -> tuple[bool, float, str]:
+    errors = _step_errors(base_vals, cur_vals, method)
+    agg_error = float(np.percentile(errors, percentile))
+    if operator == "<":
+        passed = agg_error < threshold
+    elif operator == "<=":
+        passed = agg_error <= threshold
+    else:
+        raise ValueError(f"Unknown operator: {operator}")
+    detail = f"p{percentile}={agg_error:.6f} (max={max(errors):.6f})"
+    return passed, agg_error, detail
+
+
+def _format_rl_metric_failure(
+    metric: str,
+    *,
+    method: str,
+    operator: str,
+    threshold: float,
+    detail: str,
+) -> str:
+    return (
+        f"{metric} aggregated error does not satisfy threshold {threshold} "
+        f"(method: {method}, operator: {operator}, {detail})"
+    )
+
+
 def _split_memory_segments(values: np.ndarray) -> list[np.ndarray]:
     if len(values) < MEMORY_GRADIENT_MIN_SEGMENT_LEN:
         return [values]
@@ -174,11 +248,11 @@ def check_rl_result(case_name, base_path, cur_path, assert_info, phase=None):
 
     metric_list = [item["metric"] for item in check_metrics_list]
 
-    base_steps, base_metrics = extract_value(base_path, metric_list)
-    cur_steps, cur_metrics = extract_value(cur_path, metric_list)
+    base_steps, base_metrics = extract_rl_value(base_path, metric_list)
+    cur_steps, cur_metrics = extract_rl_value(cur_path, metric_list)
 
     assert cur_steps == base_steps, (
-        f"current steps is not equal to base steps, current steps: {cur_steps}, base steps: {base_steps}"
+        f"current RL steps is not equal to base RL steps, current steps: {cur_steps}, base steps: {base_steps}"
     )
 
     check_metric_dict = {item["metric"]: item["threshold"] for item in check_metrics_list}
@@ -191,21 +265,53 @@ def check_rl_result(case_name, base_path, cur_path, assert_info, phase=None):
         threshold = config["threshold"]
         method = config["method"]
         operator = config["operator"]
+        percentile = config.get("aggregate")
+        if percentile is None and metric in RL_PERCENTILE_METRICS:
+            percentile = RL_PERCENTILE_METRICS[metric]
+
+        base_vals = base_metrics[metric]
+        cur_vals = cur_metrics[metric]
+        if not base_vals and not cur_vals:
+            logger.warning(f"Skip {metric}: absent in both baseline and current RL step summaries.")
+            continue
+        if len(base_vals) != len(cur_vals):
+            fail_metric[metric] = (
+                f"{metric} step count mismatch after RL step-summary extraction: "
+                f"baseline={len(base_vals)}, current={len(cur_vals)}"
+            )
+            continue
 
         max_error = 0.0
         max_error_idx = 0
         check_flag = True
 
-        for idx, (base_val, cur_val) in enumerate(zip(base_metrics[metric], cur_metrics[metric])):
-            if method == "absolute":
-                error = round(abs(cur_val - base_val), 5)
-            elif method == "relative":
-                if abs(base_val) < 1e-10:
-                    error = float("inf") if abs(cur_val) > 1e-10 else 0.0
-                else:
-                    error = round(abs(cur_val - base_val) / abs(base_val), 5)
+        if percentile is not None:
+            check_flag, agg_error, detail = _percentile_error_passes(
+                base_vals,
+                cur_vals,
+                method=method,
+                threshold=threshold,
+                operator=operator,
+                percentile=int(percentile),
+            )
+            if not check_flag:
+                fail_metric[metric] = _format_rl_metric_failure(
+                    metric,
+                    method=method,
+                    operator=operator,
+                    threshold=threshold,
+                    detail=detail,
+                )
             else:
-                raise ValueError(f"Unknown method: {method}")
+                logger.info(
+                    f"✓ {metric} check passed ({detail}, method: {method}, operator: {operator}, "
+                    f"threshold: {threshold})"
+                )
+            continue
+
+        for idx, (base_val, cur_val) in enumerate(zip(base_vals, cur_vals)):
+            errors = _step_errors([base_val], [cur_val], method)
+            error = round(errors[0], 5)
 
             if error > max_error:
                 max_error = error