-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyzer.py
More file actions
159 lines (133 loc) · 5.42 KB
/
analyzer.py
File metadata and controls
159 lines (133 loc) · 5.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict
from .op_events import OpEvent, OpKind, OpStream, RepeatedBlockEvent, PatternBlockEvent
@dataclass
class AnalysisReport:
    """Result record returned by ``PerformanceAnalyzer.analyze``.

    The first four fields are required; the raw totals default to zero so a
    report can be built from the derived metrics alone.
    """

    # Human-readable classification label (e.g. "Empty", "Balanced / Mixed").
    bottleneck: str
    # Accumulated stall duration divided by the makespan.
    stall_ratio: float
    # Total flops divided by total bytes (0.0 when no bytes were counted).
    arithmetic_intensity: float
    # Event name -> share of the total counted time spent in that event.
    instruction_mix: Dict[str, float]

    # Raw aggregates the derived metrics above were computed from.
    total_flops: int = 0
    total_bytes: int = 0
    # Sum of counted event durations (nanoseconds, per the ``_ns`` suffix).
    total_time_ns: int = 0
    # Root event end time minus start time, clamped at zero.
    makespan_ns: int = 0
class PerformanceAnalyzer:
    """
    Interprets an OpEvent tree into high-level insights:

    * bottleneck classification (compute / memory / latency),
    * stall ratio,
    * arithmetic intensity,
    * instruction mix (by time share).

    All aggregation happens here; the simulator is responsible only for
    producing per-execution OpEvents.

    The classification thresholds are class attributes so that a subclass
    (or a caller, per instance) can tune them without editing ``analyze``.
    """

    # A stream whose active time exceeds this fraction of the makespan is
    # reported as the bottleneck. VPU is checked before DMA.
    VPU_BOUND_THRESHOLD = 0.85
    DMA_BOUND_THRESHOLD = 0.85
    # Stall share of the makespan above which the run is "Latency Bound";
    # only consulted when neither stream threshold is exceeded.
    STALL_BOUND_THRESHOLD = 0.4

    def __init__(self, root_event: OpEvent):
        """Store the root of the event tree; ``None`` is tolerated by ``analyze``."""
        self.root = root_event

    @staticmethod
    def _empty_report(
        total_flops: int = 0,
        total_bytes: int = 0,
        total_time_ns: int = 0,
        makespan_ns: int = 0,
    ) -> AnalysisReport:
        """Build the "Empty" report, carrying through whatever totals exist."""
        return AnalysisReport(
            bottleneck="Empty",
            stall_ratio=0.0,
            arithmetic_intensity=0.0,
            instruction_mix={},
            total_flops=total_flops,
            total_bytes=total_bytes,
            total_time_ns=total_time_ns,
            makespan_ns=makespan_ns,
        )

    def analyze(self) -> AnalysisReport:
        """Walk the event tree once and summarise it.

        Returns:
            An ``AnalysisReport``. Its bottleneck is ``"Empty"`` when there is
            no root event or when the root's makespan is not positive; in the
            latter case the totals gathered during the walk are still
            reported.
        """
        if self.root is None:
            return self._empty_report()

        # Aggregate metrics from the OpEvent tree.
        total_flops = 0
        total_bytes = 0
        total_time_ns = 0
        vpu_active = 0
        dma_active = 0
        stall_total = 0
        per_op_time_ns: Dict[str, int] = {}

        def account(ev: OpEvent) -> None:
            """Add one event's own flops/bytes/time to the running totals."""
            nonlocal total_flops, total_bytes, total_time_ns
            nonlocal vpu_active, dma_active

            total_flops += ev.flops
            total_bytes += ev.bytes
            total_time_ns += ev.duration_ns
            per_op_time_ns[ev.name] = per_op_time_ns.get(ev.name, 0) + ev.duration_ns
            # Only VPU and DMA streams feed the utilisation figures; time on
            # any other stream still counts toward total_time_ns above.
            if ev.stream == OpStream.VPU:
                vpu_active += ev.duration_ns
            elif ev.stream == OpStream.DMA:
                dma_active += ev.duration_ns

        def visit(ev: OpEvent) -> None:
            """Dispatch on the event kind; see the per-branch comments."""
            nonlocal stall_total

            if ev.kind in (OpKind.LEAF, OpKind.BLOCK):
                # Counted directly, then children are visited as well.
                account(ev)
                for child in ev.children:
                    visit(child)
            elif ev.kind == OpKind.LOOP:
                # LOOP events carry metrics already scaled by trip count, so
                # the loop is counted as a unit and its children are skipped.
                account(ev)
            elif ev.kind == OpKind.STALL:
                # Stalls only contribute to the stall total; no children.
                stall_total += ev.duration_ns
            elif isinstance(ev, (RepeatedBlockEvent, PatternBlockEvent)):
                # Compressed-block containers: neither counted nor descended
                # into. This branch is reached only when ev.kind is none of
                # LEAF/BLOCK/LOOP/STALL.
                # NOTE(review): confirm the kind these classes report and
                # that skipping their subtree is intended.
                pass
            else:
                # Pure containers (e.g. ROOT, IF): accounted via children.
                for child in ev.children:
                    visit(child)

        visit(self.root)

        # Clamp so an inverted start/end pair cannot yield a negative span.
        makespan = max(self.root.end_time_ns - self.root.start_time_ns, 0)
        if makespan == 0:
            # No wall-clock span to normalise against; avoid dividing by zero.
            return self._empty_report(
                total_flops=total_flops,
                total_bytes=total_bytes,
                total_time_ns=total_time_ns,
                makespan_ns=makespan,
            )

        vpu_util = vpu_active / makespan
        dma_util = dma_active / makespan
        stall_ratio = stall_total / makespan

        # Priority order matters: compute, then memory, then latency.
        if vpu_util > self.VPU_BOUND_THRESHOLD:
            bottleneck = "Compute Bound (VPU)"
        elif dma_util > self.DMA_BOUND_THRESHOLD:
            bottleneck = "Memory Bound (DMA)"
        elif stall_ratio > self.STALL_BOUND_THRESHOLD:
            bottleneck = "Latency Bound (Stalled)"
        else:
            bottleneck = "Balanced / Mixed"

        # Instruction mix by time share over every counted event
        # (LEAF, BLOCK and LOOP), keyed by event name.
        mix: Dict[str, float] = {}
        if total_time_ns > 0:
            mix = {op: t / total_time_ns for op, t in per_op_time_ns.items()}

        if total_bytes > 0:
            arithmetic_intensity = float(total_flops) / float(total_bytes)
        else:
            arithmetic_intensity = 0.0

        return AnalysisReport(
            bottleneck=bottleneck,
            stall_ratio=stall_ratio,
            arithmetic_intensity=arithmetic_intensity,
            instruction_mix=mix,
            total_flops=total_flops,
            total_bytes=total_bytes,
            total_time_ns=total_time_ns,
            makespan_ns=makespan,
        )