From cf8cb26b4d52642bd36de16535ef8aabc3e15b29 Mon Sep 17 00:00:00 2001 From: GUOGUO <55723162+Dong1017@users.noreply.github.com> Date: Sat, 21 Mar 2026 14:28:26 +0800 Subject: [PATCH] add: put few known_issues cards into factory and add log --- .../factory/cards/cards.known_issues.log | 57 +++++++++++++++++++ .../known_issues/device-out-of-memory.yaml | 20 +++++++ .../distributed-communication-timeout.yaml | 21 +++++++ .../missing-cann-environment.yaml | 21 +++++++ .../cards/known_issues/ms-context-empty.yaml | 20 +++++++ .../ms-tbe-operator-compilation-error.yaml | 21 +++++++ .../known_issues/stack-version-mismatch.yaml | 21 +++++++ incubating/factory/manifests/pack.yaml | 8 ++- 8 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 incubating/factory/cards/cards.known_issues.log create mode 100644 incubating/factory/cards/known_issues/device-out-of-memory.yaml create mode 100644 incubating/factory/cards/known_issues/distributed-communication-timeout.yaml create mode 100644 incubating/factory/cards/known_issues/missing-cann-environment.yaml create mode 100644 incubating/factory/cards/known_issues/ms-context-empty.yaml create mode 100644 incubating/factory/cards/known_issues/ms-tbe-operator-compilation-error.yaml create mode 100644 incubating/factory/cards/known_issues/stack-version-mismatch.yaml diff --git a/incubating/factory/cards/cards.known_issues.log b/incubating/factory/cards/cards.known_issues.log new file mode 100644 index 0000000..1a5219d --- /dev/null +++ b/incubating/factory/cards/cards.known_issues.log @@ -0,0 +1,57 @@ +2026-03-21 | known_issues | first failure batch + +Summary: +- Added the first failure-oriented Factory `known_issue` batch from the + current `failure-agent` fallback knowledge. +- Expanded `known_issues` in `manifests/pack.yaml` from 3 to 9 cards. + +Selection rationale: +- Prioritized high-frequency, low-ambiguity, reusable runtime failures. +- Preferred entries with stable signatures and conservative fixes. 
+- Deferred test-framework-specific, hardware-fault-only, and long-tail + backend cases to later batches. + +Added cards: +- `missing-cann-environment` + - Ascend/CANN environment or library setup missing at import/startup time + - file: `known_issues/missing-cann-environment.yaml` + +- `device-out-of-memory` + - device-side memory exhaustion across Ascend and GPU + - file: `known_issues/device-out-of-memory.yaml` + +- `distributed-communication-timeout` + - distributed HCCL communication timeout or rank progress mismatch + - file: `known_issues/distributed-communication-timeout.yaml` + +- `ms-context-empty` + - MindSpore Ascend runtime context not initialized before execution + - file: `known_issues/ms-context-empty.yaml` + +- `ms-tbe-operator-compilation-error` + - MindSpore Ascend TBE operator compilation failure + - file: `known_issues/ms-tbe-operator-compilation-error.yaml` + +- `stack-version-mismatch` + - ABI or compatibility mismatch after partial stack upgrades + - file: `known_issues/stack-version-mismatch.yaml` + +Deferred candidates: +- `feature-or-operator-not-supported` + - useful, but currently more ambiguous and overlaps with mode/shape/dtype + precondition failures +- `pta-cann-inner-error` + - useful, but still broad as an early shared Factory card +- `ai-core-execution-timeout` + - kept in fallback knowledge first because timeout sources are still broad +- `hbm-ecc-error` + - operationally clear, but more hardware-health-oriented than software triage + +Manifest impact: +- updated `incubating/factory/manifests/pack.yaml` +- `card_count.known_issues: 3 -> 9` +- appended 6 `known_issue` entries to the `cards:` list + +Validation: +- Parsed all `incubating/factory/cards/known_issues/*.yaml` +- Verified `pack.yaml` known-issue count matches actual files and manifest entries diff --git a/incubating/factory/cards/known_issues/device-out-of-memory.yaml b/incubating/factory/cards/known_issues/device-out-of-memory.yaml new file mode 100644 index 
0000000..2a27ae0
--- /dev/null
+++ b/incubating/factory/cards/known_issues/device-out-of-memory.yaml
@@ -0,0 +1,20 @@
+kind: known_issue
+id: device-out-of-memory
+symptom: failure
+severity: high
+lifecycle:
+  state: stable
+source:
+  kind: bootstrap
+confidence:
+  level: bootstrap
+tags: [oom, memory, allocation, ascend, gpu]
+affects_platforms: [ascend, gpu]
+detection:
+  pattern: "EL0004|FAIL_TO_ALLOCATE_MEMORY|\\b200000\\b|\\b207018\\b|out of memory|CUDA out of memory"  # \b: numeric codes must not match inside longer digit runs
+description: |
+  Device memory is exhausted by model size, batch size, fragmentation, or
+  competing workloads on the same accelerator. The failure is a common
+  first-line runtime issue on both Ascend and GPU training jobs.
+fix:
+  summary: Reduce memory pressure first by lowering batch size, enabling recompute or checkpointing, and clearing stale cached memory before deeper debugging.
diff --git a/incubating/factory/cards/known_issues/distributed-communication-timeout.yaml b/incubating/factory/cards/known_issues/distributed-communication-timeout.yaml
new file mode 100644
index 0000000..b2d0d49
--- /dev/null
+++ b/incubating/factory/cards/known_issues/distributed-communication-timeout.yaml
@@ -0,0 +1,21 @@
+kind: known_issue
+id: distributed-communication-timeout
+symptom: failure
+severity: high
+lifecycle:
+  state: stable
+source:
+  kind: bootstrap
+confidence:
+  level: bootstrap
+tags: [hccl, distributed, timeout, communication, ascend]
+affects_platforms: [ascend]
+detection:
+  pattern: "HCCL|EI0002|EI0006|\\b107020\\b|notify wait|socket build|times out"  # \b: numeric code must not match inside longer digit runs
+description: |
+  Distributed execution stalls or aborts because ranks do not make
+  matching progress, network setup is broken, or the communication layer
+  times out during collective execution. The same signature often appears
+  when one rank exits early or startup ordering is inconsistent.
+fix:
+  summary: Verify rank configuration, startup ordering, and network reachability first, then re-check timeout settings and whether one rank exits early.
diff --git a/incubating/factory/cards/known_issues/missing-cann-environment.yaml b/incubating/factory/cards/known_issues/missing-cann-environment.yaml
new file mode 100644
index 0000000..321645d
--- /dev/null
+++ b/incubating/factory/cards/known_issues/missing-cann-environment.yaml
@@ -0,0 +1,21 @@
+kind: known_issue
+id: missing-cann-environment
+symptom: failure
+severity: high
+lifecycle:
+  state: stable
+source:
+  kind: bootstrap
+confidence:
+  level: bootstrap
+tags: [cann, environment, ascend, import, setup]
+affects_platforms: [ascend]
+detection:
+  pattern: "libascendcl\\.so not found|libhccl\\.so not found|libascendcl\\.so: cannot open shared object file|libhccl\\.so: cannot open shared object file|ASCEND_OPP_PATH|cannot find CANN"  # also covers the Linux loader wording for a missing .so
+description: |
+  Runtime initialization fails on Ascend because required CANN libraries
+  or environment variables are missing, unset, or point to an incomplete
+  toolkit installation. The failure usually appears during import or
+  startup before model execution begins.
+fix:
+  summary: Source the Ascend toolkit environment script, verify the CANN installation, and re-check required Ascend environment variables before retrying.
diff --git a/incubating/factory/cards/known_issues/ms-context-empty.yaml b/incubating/factory/cards/known_issues/ms-context-empty.yaml
new file mode 100644
index 0000000..92da8fe
--- /dev/null
+++ b/incubating/factory/cards/known_issues/ms-context-empty.yaml
@@ -0,0 +1,20 @@
+kind: known_issue
+id: ms-context-empty
+symptom: failure
+severity: high
+lifecycle:
+  state: stable
+source:
+  kind: bootstrap
+confidence:
+  level: bootstrap
+tags: [mindspore, context, initialization, runtime, ascend]
+affects_platforms: [ascend]
+detection:
+  pattern: "\\b107002\\b|context is empty|aclrtSetContext|aclrtSetDevice"  # \b: numeric code must not match inside longer digit runs
+description: |
+  Ascend runtime APIs are called before MindSpore establishes a valid
+  device context for the current process. The failure usually reflects
+  incorrect initialization order rather than an operator or kernel bug.
+fix:
+  summary: Ensure context initialization happens before tensor or operator execution and verify device setup runs exactly once in the expected startup path.
diff --git a/incubating/factory/cards/known_issues/ms-tbe-operator-compilation-error.yaml b/incubating/factory/cards/known_issues/ms-tbe-operator-compilation-error.yaml
new file mode 100644
index 0000000..4f302a7
--- /dev/null
+++ b/incubating/factory/cards/known_issues/ms-tbe-operator-compilation-error.yaml
@@ -0,0 +1,21 @@
+kind: known_issue
+id: ms-tbe-operator-compilation-error
+symptom: failure
+severity: high
+lifecycle:
+  state: stable
+source:
+  kind: bootstrap
+confidence:
+  level: bootstrap
+tags: [mindspore, ascend, tbe, compile, kernel, ub]
+affects_platforms: [ascend]
+detection:
+  pattern: "TBE|compile failed|\\bE9[0-9A-Z]+\\b|\\bEB[0-9A-Z]+\\b|UB overflow|operator compilation"  # \b keeps the E9*/EB* codes from matching inside words such as DEBUG or hex dumps
+description: |
+  MindSpore fails while compiling a TBE operator on Ascend because the
+  input shape, dtype, or backend toolchain path violates kernel-generation
+  constraints. The signature often points to backend compile-time limits
+  rather than frontend API misuse.
+fix:
+  summary: Recheck the operator input shape and dtype against backend limits, then validate whether the current CANN stack supports that compilation path before forcing fallback behavior.
diff --git a/incubating/factory/cards/known_issues/stack-version-mismatch.yaml b/incubating/factory/cards/known_issues/stack-version-mismatch.yaml
new file mode 100644
index 0000000..19d0efe
--- /dev/null
+++ b/incubating/factory/cards/known_issues/stack-version-mismatch.yaml
@@ -0,0 +1,21 @@
+kind: known_issue
+id: stack-version-mismatch
+symptom: failure
+severity: high
+lifecycle:
+  state: stable
+source:
+  kind: bootstrap
+confidence:
+  level: bootstrap
+tags: [version, compatibility, abi, symbol, ascend]
+affects_platforms: [ascend]
+detection:
+  pattern: "undefined symbol|symbol not found|ABI mismatch|version mismatch|import fails after upgrade|compatibility"  # "undefined symbol" is the Linux loader wording for a missing symbol
+description: |
+  Runtime components were installed from incompatible version sets, so
+  symbols, kernels, or registrations expected by one layer are missing in
+  another. The issue commonly appears after upgrading only part of the
+  stack, such as torch_npu, MindSpore, PyTorch, or CANN.
+fix:
+  summary: Rebuild or reinstall the runtime stack with an explicitly compatible version matrix instead of mixing independently upgraded components.
diff --git a/incubating/factory/manifests/pack.yaml b/incubating/factory/manifests/pack.yaml index a53e5a0..bf4a9ed 100644 --- a/incubating/factory/manifests/pack.yaml +++ b/incubating/factory/manifests/pack.yaml @@ -3,7 +3,7 @@ channel: stable created_at: "2026-03-19" card_count: operators: 6 - known_issues: 3 + known_issues: 9 perf_features: 9 algo_features: 9 models: 1 @@ -17,6 +17,12 @@ cards: - { id: dsa-torch27-ascend, kind: known_issue, path: known_issues/dsa-torch27-ascend.yaml } - { id: fp16-softmax-drift, kind: known_issue, path: known_issues/fp16-softmax-drift.yaml } - { id: cann-flash-attn-version, kind: known_issue, path: known_issues/cann-flash-attn-version.yaml } + - { id: missing-cann-environment, kind: known_issue, path: known_issues/missing-cann-environment.yaml } + - { id: device-out-of-memory, kind: known_issue, path: known_issues/device-out-of-memory.yaml } + - { id: distributed-communication-timeout, kind: known_issue, path: known_issues/distributed-communication-timeout.yaml } + - { id: ms-context-empty, kind: known_issue, path: known_issues/ms-context-empty.yaml } + - { id: ms-tbe-operator-compilation-error, kind: known_issue, path: known_issues/ms-tbe-operator-compilation-error.yaml } + - { id: stack-version-mismatch, kind: known_issue, path: known_issues/stack-version-mismatch.yaml } - { id: fused-adam, kind: perf_feature, path: perf_features/fused-adam.yaml } - { id: flash-attn-v2, kind: perf_feature, path: perf_features/flash-attn-v2.yaml } - { id: gradient-ckpt, kind: perf_feature, path: perf_features/gradient-ckpt.yaml }