# strix/analyzer.py (primatrix/strix, main branch)
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict

from .op_events import OpEvent, OpKind, OpStream, RepeatedBlockEvent, PatternBlockEvent


@dataclass
class AnalysisReport:
    """Result record returned by ``PerformanceAnalyzer.analyze``."""

    # Classification label chosen by the analyzer: "Empty",
    # "Compute Bound (VPU)", "Memory Bound (DMA)", "Latency Bound (Stalled)"
    # or "Balanced / Mixed".
    bottleneck: str
    # Accumulated STALL duration divided by the makespan (0.0 for "Empty").
    stall_ratio: float
    # total_flops / total_bytes, or 0.0 when no bytes were recorded.
    arithmetic_intensity: float
    # Event name -> share of total_time_ns spent in events with that name.
    instruction_mix: Dict[str, float]
    # Sum of ``flops`` over every counted event.
    total_flops: int = 0
    # Sum of ``bytes`` over every counted event.
    total_bytes: int = 0
    # Sum of ``duration_ns`` over every counted event (stalls excluded).
    total_time_ns: int = 0
    # Root event end time minus start time, clamped at zero.
    makespan_ns: int = 0


class PerformanceAnalyzer:
    """
    Interprets an OpEvent tree into high-level insights:
      * bottleneck classification (compute / memory / latency),
      * stall ratio,
      * arithmetic intensity,
      * instruction mix (by time share).

    All aggregation happens here; the simulator is responsible only for
    producing per-execution OpEvents.
    """

    # Stream utilisation (active time / makespan) above which that stream is
    # reported as the bottleneck, and the stall share of the makespan above
    # which the run is reported as latency bound.
    _VPU_BOUND_UTIL = 0.85
    _DMA_BOUND_UTIL = 0.85
    _STALL_BOUND_RATIO = 0.4

    def __init__(self, root_event: OpEvent):
        """Keep the root of the event tree; a ``None`` root yields an "Empty" report."""
        self.root = root_event

    def analyze(self) -> AnalysisReport:
        """
        Walk the event tree once and summarise it.

        Returns:
            An ``AnalysisReport``. The bottleneck is "Empty" (with zero
            ratios and an empty instruction mix) when there is no root event
            or when the root's makespan is zero; in the latter case the
            aggregated totals are still reported.
        """
        if self.root is None:
            return self._empty_report()

        # Aggregate metrics from the OpEvent tree.
        total_flops = 0
        total_bytes = 0
        total_time_ns = 0
        vpu_active = 0
        dma_active = 0
        stall_total = 0
        per_op_time_ns: Dict[str, int] = {}

        # Explicit stack instead of recursion so that a deeply nested tree
        # cannot hit the interpreter's recursion limit. Children are pushed
        # in reverse so events are still processed in pre-order, which keeps
        # the key order of the instruction mix stable.
        pending = [self.root]
        while pending:
            ev = pending.pop()
            kind = ev.kind
            counted_with_children = kind in (OpKind.LEAF, OpKind.BLOCK)

            if counted_with_children or kind == OpKind.LOOP:
                # LEAF / BLOCK / LOOP events contribute their own metrics.
                duration = ev.duration_ns
                total_flops += ev.flops
                total_bytes += ev.bytes
                total_time_ns += duration
                per_op_time_ns[ev.name] = per_op_time_ns.get(ev.name, 0) + duration

                if ev.stream == OpStream.VPU:
                    vpu_active += duration
                elif ev.stream == OpStream.DMA:
                    dma_active += duration

                # LOOP events carry metrics already scaled by trip count and
                # are counted as a unit, so only LEAF / BLOCK children are
                # descended into.
                if counted_with_children:
                    pending.extend(reversed(list(ev.children)))

            elif kind == OpKind.STALL:
                # Stalls only feed the stall ratio; they have no children.
                stall_total += ev.duration_ns

            elif isinstance(ev, (RepeatedBlockEvent, PatternBlockEvent)):
                # Containers for compressed children: neither counted nor
                # descended into.
                # NOTE(review): this branch is only reached when the event's
                # kind is none of LEAF/BLOCK/LOOP/STALL - confirm these
                # classes use a distinct kind.
                continue

            else:
                # Container events like ROOT, IF: accounted for via children.
                pending.extend(reversed(list(ev.children)))

        makespan = max(self.root.end_time_ns - self.root.start_time_ns, 0)
        if makespan == 0:
            # No elapsed time to normalise against; report the raw totals.
            return self._empty_report(
                total_flops=total_flops,
                total_bytes=total_bytes,
                total_time_ns=total_time_ns,
                makespan_ns=makespan,
            )

        stall_ratio = stall_total / makespan
        bottleneck = self._classify_bottleneck(
            vpu_active / makespan,
            dma_active / makespan,
            stall_ratio,
        )

        # Instruction mix by time over every counted event (LEAF, BLOCK and
        # LOOP), keyed by event name.
        mix: Dict[str, float] = {}
        if total_time_ns > 0:
            mix = {op: t / total_time_ns for op, t in per_op_time_ns.items()}

        if total_bytes > 0:
            arithmetic_intensity = float(total_flops) / float(total_bytes)
        else:
            arithmetic_intensity = 0.0

        return AnalysisReport(
            bottleneck=bottleneck,
            stall_ratio=stall_ratio,
            arithmetic_intensity=arithmetic_intensity,
            instruction_mix=mix,
            total_flops=total_flops,
            total_bytes=total_bytes,
            total_time_ns=total_time_ns,
            makespan_ns=makespan,
        )

    @staticmethod
    def _empty_report(
        total_flops: int = 0,
        total_bytes: int = 0,
        total_time_ns: int = 0,
        makespan_ns: int = 0,
    ) -> AnalysisReport:
        """Build the "Empty" report, optionally carrying aggregated totals."""
        return AnalysisReport(
            bottleneck="Empty",
            stall_ratio=0.0,
            arithmetic_intensity=0.0,
            instruction_mix={},
            total_flops=total_flops,
            total_bytes=total_bytes,
            total_time_ns=total_time_ns,
            makespan_ns=makespan_ns,
        )

    @classmethod
    def _classify_bottleneck(
        cls, vpu_util: float, dma_util: float, stall_ratio: float
    ) -> str:
        """Map utilisation figures to a bottleneck label; first match wins."""
        if vpu_util > cls._VPU_BOUND_UTIL:
            return "Compute Bound (VPU)"
        if dma_util > cls._DMA_BOUND_UTIL:
            return "Memory Bound (DMA)"
        if stall_ratio > cls._STALL_BOUND_RATIO:
            return "Latency Bound (Stalled)"
        return "Balanced / Mixed"