-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel_architecture
More file actions
711 lines (711 loc) · 22.3 KB
/
model_architecture
File metadata and controls
711 lines (711 loc) · 22.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
digraph {
graph [size="96.45,96.45"]
node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled]
5272679024 [label="
(1, 10)" fillcolor=darkolivegreen1]
5273049072 [label=AddmmBackward0]
5273048256 -> 5273049072
5272812736 [label="fc3.bias
(10)" fillcolor=lightblue]
5272812736 -> 5273048256
5273048256 [label=AccumulateGrad]
5273048208 -> 5273049072
5273048208 [label=ReluBackward0]
5273048880 -> 5273048208
5273048880 [label=NativeBatchNormBackward0]
5273047584 -> 5273048880
5273047584 [label=AddmmBackward0]
5273047152 -> 5273047584
5272812176 [label="fc2.bias
(64)" fillcolor=lightblue]
5272812176 -> 5273047152
5273047152 [label=AccumulateGrad]
5273047824 -> 5273047584
5273047824 [label=ReluBackward0]
5273050992 -> 5273047824
5273050992 [label=NativeBatchNormBackward0]
5273047872 -> 5273050992
5273047872 [label=AddmmBackward0]
5273050800 -> 5273047872
5272651312 [label="fc1.bias
(128)" fillcolor=lightblue]
5272651312 -> 5273050800
5273050800 [label=AccumulateGrad]
5273051088 -> 5273047872
5273051088 [label=ViewBackward0]
5273050416 -> 5273051088
5273050416 [label=NativeLayerNormBackward0]
5273050608 -> 5273050416
5273050608 [label=NativeLayerNormBackward0]
5273049792 -> 5273050608
5273049792 [label=AddBackward0]
5273049984 -> 5273049792
5273049984 [label=NativeLayerNormBackward0]
5273049600 -> 5273049984
5273049600 [label=AddBackward0]
5273049312 -> 5273049600
5273049312 [label=NativeLayerNormBackward0]
5273048928 -> 5273049312
5273048928 [label=AddBackward0]
5273048640 -> 5273048928
5273048640 [label=NativeLayerNormBackward0]
5273048784 -> 5273048640
5273048784 [label=AddBackward0]
5273047968 -> 5273048784
5273047968 [label=NativeLayerNormBackward0]
5273048112 -> 5273047968
5273048112 [label=AddBackward0]
5273047296 -> 5273048112
5273047296 [label=NativeLayerNormBackward0]
5273047440 -> 5273047296
5273047440 [label=AddBackward0]
5273047104 -> 5273047440
5273047104 [label=NativeLayerNormBackward0]
5272694448 -> 5273047104
5272694448 [label=AddBackward0]
5272693824 -> 5272694448
5272693824 [label=NativeLayerNormBackward0]
5272693152 -> 5272693824
5272693152 [label=AddBackward0]
5272692528 -> 5272693152
5272692528 [label=PermuteBackward0]
5272691760 -> 5272692528
5272691760 [label=ReluBackward0]
5272691952 -> 5272691760
5272691952 [label=ConvolutionBackward0]
5272691136 -> 5272691952
5272691136 [label=PermuteBackward0]
5272694544 -> 5272691136
5272694544 [label=CatBackward0]
5272694640 -> 5272694544
5272694640 [label=EmbeddingBackward0]
5272694736 -> 5272694640
4340014432 [label="embedding.weight
(301, 16)" fillcolor=lightblue]
4340014432 -> 5272694736
5272694736 [label=AccumulateGrad]
5272691088 -> 5272691952
5272650912 [label="conv1.weight
(32, 17, 3)" fillcolor=lightblue]
5272650912 -> 5272691088
5272691088 [label=AccumulateGrad]
5272692576 -> 5272691952
5272650992 [label="conv1.bias
(32)" fillcolor=lightblue]
5272650992 -> 5272692576
5272692576 [label=AccumulateGrad]
5272692384 -> 5272693152
5272692384 [label=TransposeBackward0]
5272691280 -> 5272692384
5272691280 [label=ViewBackward0]
5272694592 -> 5272691280
5272694592 [label=AddmmBackward0]
5272694352 -> 5272694592
5272652832 [label="transformer.layers.0.self_attn.out_proj.bias
(32)" fillcolor=lightblue]
5272652832 -> 5272694352
5272694352 [label=AccumulateGrad]
5272694688 -> 5272694592
5272694688 [label=ViewBackward0]
5272691328 -> 5272694688
5272691328 [label=PermuteBackward0]
5272694064 -> 5272691328
5272694064 [label=ScaledDotProductFlashAttentionBackward0]
5272694160 -> 5272694064
5272694160 [label=ViewBackward0]
5272693344 -> 5272694160
5272693344 [label=TransposeBackward0]
5272693440 -> 5272693344
5272693440 [label=ViewBackward0]
5272693536 -> 5272693440
5272693536 [label=SelectBackward0]
5272693104 -> 5272693536
5272693104 [label=CloneBackward0]
5272692720 -> 5272693104
5272692720 [label=SqueezeBackward1]
5272692816 -> 5272692720
5272692816 [label=TransposeBackward0]
5272692912 -> 5272692816
5272692912 [label=UnsqueezeBackward0]
5272692480 -> 5272692912
5272692480 [label=ViewBackward0]
5272692096 -> 5272692480
5272692096 [label=AddBackward0]
5272692192 -> 5272692096
5272692192 [label=UnsafeViewBackward0]
5272691808 -> 5272692192
5272691808 [label=MmBackward0]
5272691424 -> 5272691808
5272691424 [label=ReshapeAliasBackward0]
5272691568 -> 5272691424
5272691568 [label=TransposeBackward0]
5272692528 -> 5272691568
5272691856 -> 5272691808
5272691856 [label=TBackward0]
5272691616 -> 5272691856
5272652112 [label="transformer.layers.0.self_attn.in_proj_weight
(96, 32)" fillcolor=lightblue]
5272652112 -> 5272691616
5272691616 [label=AccumulateGrad]
5272692144 -> 5272692096
5272652192 [label="transformer.layers.0.self_attn.in_proj_bias
(96)" fillcolor=lightblue]
5272652192 -> 5272692144
5272692144 [label=AccumulateGrad]
5272694112 -> 5272694064
5272694112 [label=ViewBackward0]
5272693488 -> 5272694112
5272693488 [label=TransposeBackward0]
5272692672 -> 5272693488
5272692672 [label=ViewBackward0]
5272692864 -> 5272692672
5272692864 [label=SelectBackward0]
5272693104 -> 5272692864
5272693968 -> 5272694064
5272693968 [label=ViewBackward0]
5272692768 -> 5272693968
5272692768 [label=TransposeBackward0]
5272692432 -> 5272692768
5272692432 [label=ViewBackward0]
5272693728 -> 5272692432
5272693728 [label=SelectBackward0]
5272693104 -> 5272693728
5272691712 -> 5272694592
5272691712 [label=TBackward0]
5272693680 -> 5272691712
5272652752 [label="transformer.layers.0.self_attn.out_proj.weight
(32, 32)" fillcolor=lightblue]
5272652752 -> 5272693680
5272693680 [label=AccumulateGrad]
5272693008 -> 5272693824
5272653072 [label="transformer.layers.0.norm1.weight
(32)" fillcolor=lightblue]
5272653072 -> 5272693008
5272693008 [label=AccumulateGrad]
5272692960 -> 5272693824
5272653152 [label="transformer.layers.0.norm1.bias
(32)" fillcolor=lightblue]
5272653152 -> 5272692960
5272692960 [label=AccumulateGrad]
5272693776 -> 5272694448
5272693776 [label=ViewBackward0]
5272691904 -> 5272693776
5272691904 [label=AddmmBackward0]
5272694304 -> 5272691904
5272652992 [label="transformer.layers.0.linear2.bias
(32)" fillcolor=lightblue]
5272652992 -> 5272694304
5272694304 [label=AccumulateGrad]
5272694496 -> 5272691904
5272694496 [label=ViewBackward0]
5272692048 -> 5272694496
5272692048 [label=ReluBackward0]
5272692240 -> 5272692048
5272692240 [label=ViewBackward0]
5272693392 -> 5272692240
5272693392 [label=AddmmBackward0]
5272691184 -> 5272693392
5272652592 [label="transformer.layers.0.linear1.bias
(2048)" fillcolor=lightblue]
5272652592 -> 5272691184
5272691184 [label=AccumulateGrad]
5272692288 -> 5272693392
5272692288 [label=ViewBackward0]
5272693824 -> 5272692288
5272693920 -> 5272693392
5272693920 [label=TBackward0]
5272690800 -> 5272693920
5272652432 [label="transformer.layers.0.linear1.weight
(2048, 32)" fillcolor=lightblue]
5272652432 -> 5272690800
5272690800 [label=AccumulateGrad]
5272693200 -> 5272691904
5272693200 [label=TBackward0]
5272691664 -> 5272693200
5272652912 [label="transformer.layers.0.linear2.weight
(32, 2048)" fillcolor=lightblue]
5272652912 -> 5272691664
5272691664 [label=AccumulateGrad]
5272694400 -> 5273047104
5272653232 [label="transformer.layers.0.norm2.weight
(32)" fillcolor=lightblue]
5272653232 -> 5272694400
5272694400 [label=AccumulateGrad]
5272694256 -> 5273047104
5272653312 [label="transformer.layers.0.norm2.bias
(32)" fillcolor=lightblue]
5272653312 -> 5272694256
5272694256 [label=AccumulateGrad]
5273047536 -> 5273047440
5273047536 [label=TransposeBackward0]
5272692336 -> 5273047536
5272692336 [label=ViewBackward0]
5272693296 -> 5272692336
5272693296 [label=AddmmBackward0]
5272691472 -> 5272693296
5272809536 [label="transformer.layers.1.self_attn.out_proj.bias
(32)" fillcolor=lightblue]
5272809536 -> 5272691472
5272691472 [label=AccumulateGrad]
5272691232 -> 5272693296
5272691232 [label=ViewBackward0]
5272690896 -> 5272691232
5272690896 [label=PermuteBackward0]
5272690992 -> 5272690896
5272690992 [label=ScaledDotProductFlashAttentionBackward0]
5272691040 -> 5272690992
5272691040 [label=ViewBackward0]
5272805088 -> 5272691040
5272805088 [label=TransposeBackward0]
5272805232 -> 5272805088
5272805232 [label=ViewBackward0]
5268259696 -> 5272805232
5268259696 [label=SelectBackward0]
5268259024 -> 5268259696
5268259024 [label=CloneBackward0]
5272610464 -> 5268259024
5272610464 [label=SqueezeBackward1]
5092658960 -> 5272610464
5092658960 [label=TransposeBackward0]
5272610320 -> 5092658960
5272610320 [label=UnsqueezeBackward0]
5273066848 -> 5272610320
5273066848 [label=ViewBackward0]
5273067040 -> 5273066848
5273067040 [label=ViewBackward0]
5273066272 -> 5273067040
5273066272 [label=AddmmBackward0]
5273066464 -> 5273066272
5272653472 [label="transformer.layers.1.self_attn.in_proj_bias
(96)" fillcolor=lightblue]
5272653472 -> 5273066464
5273066464 [label=AccumulateGrad]
5273066416 -> 5273066272
5273066416 [label=ViewBackward0]
5273065600 -> 5273066416
5273065600 [label=TransposeBackward0]
5273047104 -> 5273065600
5273067424 -> 5273066272
5273067424 [label=TBackward0]
5273064928 -> 5273067424
5272653392 [label="transformer.layers.1.self_attn.in_proj_weight
(96, 32)" fillcolor=lightblue]
5272653392 -> 5273064928
5273064928 [label=AccumulateGrad]
5272691520 -> 5272690992
5272691520 [label=ViewBackward0]
5268259648 -> 5272691520
5268259648 [label=TransposeBackward0]
5272805328 -> 5268259648
5272805328 [label=ViewBackward0]
5272610512 -> 5272805328
5272610512 [label=SelectBackward0]
5268259024 -> 5272610512
5272803792 -> 5272690992
5272803792 [label=ViewBackward0]
5272610272 -> 5272803792
5272610272 [label=TransposeBackward0]
5272805280 -> 5272610272
5272805280 [label=ViewBackward0]
5273066224 -> 5272805280
5273066224 [label=SelectBackward0]
5268259024 -> 5273066224
5272693584 -> 5272693296
5272693584 [label=TBackward0]
5272805136 -> 5272693584
5272653712 [label="transformer.layers.1.self_attn.out_proj.weight
(32, 32)" fillcolor=lightblue]
5272653712 -> 5272805136
5272805136 [label=AccumulateGrad]
5273047392 -> 5273047296
5272809776 [label="transformer.layers.1.norm1.weight
(32)" fillcolor=lightblue]
5272809776 -> 5273047392
5273047392 [label=AccumulateGrad]
5273047344 -> 5273047296
5272809856 [label="transformer.layers.1.norm1.bias
(32)" fillcolor=lightblue]
5272809856 -> 5273047344
5273047344 [label=AccumulateGrad]
5273047728 -> 5273048112
5273047728 [label=ViewBackward0]
5272610224 -> 5273047728
5272610224 [label=AddmmBackward0]
5272694016 -> 5272610224
5272809696 [label="transformer.layers.1.linear2.bias
(32)" fillcolor=lightblue]
5272809696 -> 5272694016
5272694016 [label=AccumulateGrad]
5272693632 -> 5272610224
5272693632 [label=ViewBackward0]
5272690944 -> 5272693632
5272690944 [label=ReluBackward0]
5273064976 -> 5272690944
5273064976 [label=ViewBackward0]
5273066992 -> 5273064976
5273066992 [label=AddmmBackward0]
5273065792 -> 5273066992
5272653632 [label="transformer.layers.1.linear1.bias
(2048)" fillcolor=lightblue]
5272653632 -> 5273065792
5273065792 [label=AccumulateGrad]
5273065120 -> 5273066992
5273065120 [label=ViewBackward0]
5273047296 -> 5273065120
5273066800 -> 5273066992
5273066800 [label=TBackward0]
5273064352 -> 5273066800
5272653552 [label="transformer.layers.1.linear1.weight
(2048, 32)" fillcolor=lightblue]
5272653552 -> 5273064352
5273064352 [label=AccumulateGrad]
5272694208 -> 5272610224
5272694208 [label=TBackward0]
5272690848 -> 5272694208
5272809616 [label="transformer.layers.1.linear2.weight
(32, 2048)" fillcolor=lightblue]
5272809616 -> 5272690848
5272690848 [label=AccumulateGrad]
5273048064 -> 5273047968
5272809936 [label="transformer.layers.1.norm2.weight
(32)" fillcolor=lightblue]
5272809936 -> 5273048064
5273048064 [label=AccumulateGrad]
5273048016 -> 5273047968
5272810016 [label="transformer.layers.1.norm2.bias
(32)" fillcolor=lightblue]
5272810016 -> 5273048016
5273048016 [label=AccumulateGrad]
5273047920 -> 5273048784
5273047920 [label=TransposeBackward0]
5273047488 -> 5273047920
5273047488 [label=ViewBackward0]
5272693056 -> 5273047488
5272693056 [label=AddmmBackward0]
5273065744 -> 5272693056
5272810496 [label="transformer.layers.2.self_attn.out_proj.bias
(32)" fillcolor=lightblue]
5272810496 -> 5273065744
5273065744 [label=AccumulateGrad]
5273064304 -> 5272693056
5273064304 [label=ViewBackward0]
5273064544 -> 5273064304
5273064544 [label=PermuteBackward0]
5273063728 -> 5273064544
5273063728 [label=ScaledDotProductFlashAttentionBackward0]
5273063920 -> 5273063728
5273063920 [label=ViewBackward0]
5273067280 -> 5273063920
5273067280 [label=TransposeBackward0]
5273067376 -> 5273067280
5273067376 [label=ViewBackward0]
5273066944 -> 5273067376
5273066944 [label=SelectBackward0]
5273066560 -> 5273066944
5273066560 [label=CloneBackward0]
5273066656 -> 5273066560
5273066656 [label=SqueezeBackward1]
5273066752 -> 5273066656
5273066752 [label=TransposeBackward0]
5273066368 -> 5273066752
5273066368 [label=UnsqueezeBackward0]
5273065984 -> 5273066368
5273065984 [label=ViewBackward0]
5273066080 -> 5273065984
5273066080 [label=ViewBackward0]
5273066176 -> 5273066080
5273066176 [label=AddmmBackward0]
5273065696 -> 5273066176
5272810176 [label="transformer.layers.2.self_attn.in_proj_bias
(96)" fillcolor=lightblue]
5272810176 -> 5273065696
5273065696 [label=AccumulateGrad]
5273065648 -> 5273066176
5273065648 [label=ViewBackward0]
5273065312 -> 5273065648
5273065312 [label=TransposeBackward0]
5273047968 -> 5273065312
5273067184 -> 5273066176
5273067184 [label=TBackward0]
5273065456 -> 5273067184
5272810096 [label="transformer.layers.2.self_attn.in_proj_weight
(96, 32)" fillcolor=lightblue]
5272810096 -> 5273065456
5273065456 [label=AccumulateGrad]
5273063872 -> 5273063728
5273063872 [label=ViewBackward0]
5273066896 -> 5273063872
5273066896 [label=TransposeBackward0]
5273066608 -> 5273066896
5273066608 [label=ViewBackward0]
5273066320 -> 5273066608
5273066320 [label=SelectBackward0]
5273066560 -> 5273066320
5273065168 -> 5273063728
5273065168 [label=ViewBackward0]
5273066704 -> 5273065168
5273066704 [label=TransposeBackward0]
5273065936 -> 5273066704
5273065936 [label=ViewBackward0]
5273066128 -> 5273065936
5273066128 [label=SelectBackward0]
5273066560 -> 5273066128
5273065552 -> 5272693056
5273065552 [label=TBackward0]
5273067136 -> 5273065552
5272810416 [label="transformer.layers.2.self_attn.out_proj.weight
(32, 32)" fillcolor=lightblue]
5272810416 -> 5273067136
5273067136 [label=AccumulateGrad]
5273048736 -> 5273048640
5272810736 [label="transformer.layers.2.norm1.weight
(32)" fillcolor=lightblue]
5272810736 -> 5273048736
5273048736 [label=AccumulateGrad]
5273048688 -> 5273048640
5272810816 [label="transformer.layers.2.norm1.bias
(32)" fillcolor=lightblue]
5272810816 -> 5273048688
5273048688 [label=AccumulateGrad]
5273048592 -> 5273048928
5273048592 [label=ViewBackward0]
5273047680 -> 5273048592
5273047680 [label=AddmmBackward0]
5273048160 -> 5273047680
5272810656 [label="transformer.layers.2.linear2.bias
(32)" fillcolor=lightblue]
5272810656 -> 5273048160
5273048160 [label=AccumulateGrad]
5273048304 -> 5273047680
5273048304 [label=ViewBackward0]
5273066032 -> 5273048304
5273066032 [label=ReluBackward0]
5273065504 -> 5273066032
5273065504 [label=ViewBackward0]
5273067328 -> 5273065504
5273067328 [label=AddmmBackward0]
5273065408 -> 5273067328
5272810336 [label="transformer.layers.2.linear1.bias
(2048)" fillcolor=lightblue]
5272810336 -> 5273065408
5273065408 [label=AccumulateGrad]
5273065024 -> 5273067328
5273065024 [label=ViewBackward0]
5273048640 -> 5273065024
5273064496 -> 5273067328
5273064496 [label=TBackward0]
5273064688 -> 5273064496
5272810256 [label="transformer.layers.2.linear1.weight
(2048, 32)" fillcolor=lightblue]
5272810256 -> 5273064688
5273064688 [label=AccumulateGrad]
5273067472 -> 5273047680
5273067472 [label=TBackward0]
5273065264 -> 5273067472
5272810576 [label="transformer.layers.2.linear2.weight
(32, 2048)" fillcolor=lightblue]
5272810576 -> 5273065264
5273065264 [label=AccumulateGrad]
5273049408 -> 5273049312
5272810896 [label="transformer.layers.2.norm2.weight
(32)" fillcolor=lightblue]
5272810896 -> 5273049408
5273049408 [label=AccumulateGrad]
5273049360 -> 5273049312
5272810976 [label="transformer.layers.2.norm2.bias
(32)" fillcolor=lightblue]
5272810976 -> 5273049360
5273049360 [label=AccumulateGrad]
5273049264 -> 5273049600
5273049264 [label=TransposeBackward0]
5273048544 -> 5273049264
5273048544 [label=ViewBackward0]
5273064640 -> 5273048544
5273064640 [label=AddmmBackward0]
5273066512 -> 5273064640
5272811456 [label="transformer.layers.3.self_attn.out_proj.bias
(32)" fillcolor=lightblue]
5272811456 -> 5273066512
5273066512 [label=AccumulateGrad]
5273065360 -> 5273064640
5273065360 [label=ViewBackward0]
5273064736 -> 5273065360
5273064736 [label=PermuteBackward0]
5273064400 -> 5273064736
5273064400 [label=ScaledDotProductFlashAttentionBackward0]
5273064016 -> 5273064400
5273064016 [label=ViewBackward0]
5273064208 -> 5273064016
5273064208 [label=TransposeBackward0]
5273063776 -> 5273064208
5273063776 [label=ViewBackward0]
5273063488 -> 5273063776
5273063488 [label=SelectBackward0]
5273063584 -> 5273063488
5273063584 [label=CloneBackward0]
5273067088 -> 5273063584
5273067088 [label=SqueezeBackward1]
5273064112 -> 5273067088
5273064112 [label=TransposeBackward0]
5273075824 -> 5273064112
5273075824 [label=UnsqueezeBackward0]
5273076544 -> 5273075824
5273076544 [label=ViewBackward0]
5273076640 -> 5273076544
5273076640 [label=ViewBackward0]
5273076736 -> 5273076640
5273076736 [label=AddmmBackward0]
5273076832 -> 5273076736
5272811136 [label="transformer.layers.3.self_attn.in_proj_bias
(96)" fillcolor=lightblue]
5272811136 -> 5273076832
5273076832 [label=AccumulateGrad]
5273076784 -> 5273076736
5273076784 [label=ViewBackward0]
5273076928 -> 5273076784
5273076928 [label=TransposeBackward0]
5273049312 -> 5273076928
5273075872 -> 5273076736
5273075872 [label=TBackward0]
5273077072 -> 5273075872
5272811056 [label="transformer.layers.3.self_attn.in_proj_weight
(96, 32)" fillcolor=lightblue]
5272811056 -> 5273077072
5273077072 [label=AccumulateGrad]
5273064448 -> 5273064400
5273064448 [label=ViewBackward0]
5273063824 -> 5273064448
5273063824 [label=TransposeBackward0]
5273063632 -> 5273063824
5273063632 [label=ViewBackward0]
5273064160 -> 5273063632
5273064160 [label=SelectBackward0]
5273063584 -> 5273064160
5273064832 -> 5273064400
5273064832 [label=ViewBackward0]
5273063536 -> 5273064832
5273063536 [label=TransposeBackward0]
5273076496 -> 5273063536
5273076496 [label=ViewBackward0]
5273076688 -> 5273076496
5273076688 [label=SelectBackward0]
5273063584 -> 5273076688
5273067232 -> 5273064640
5273067232 [label=TBackward0]
5273064064 -> 5273067232
5272811376 [label="transformer.layers.3.self_attn.out_proj.weight
(32, 32)" fillcolor=lightblue]
5272811376 -> 5273064064
5273064064 [label=AccumulateGrad]
5273049552 -> 5273049984
5272811696 [label="transformer.layers.3.norm1.weight
(32)" fillcolor=lightblue]
5272811696 -> 5273049552
5273049552 [label=AccumulateGrad]
5273050032 -> 5273049984
5272811776 [label="transformer.layers.3.norm1.bias
(32)" fillcolor=lightblue]
5272811776 -> 5273050032
5273050032 [label=AccumulateGrad]
5273049936 -> 5273049792
5273049936 [label=ViewBackward0]
5273048976 -> 5273049936
5273048976 [label=AddmmBackward0]
5273049168 -> 5273048976
5272811616 [label="transformer.layers.3.linear2.bias
(32)" fillcolor=lightblue]
5272811616 -> 5273049168
5273049168 [label=AccumulateGrad]
5273064784 -> 5273048976
5273064784 [label=ViewBackward0]
5273064256 -> 5273064784
5273064256 [label=ReluBackward0]
5273077120 -> 5273064256
5273077120 [label=ViewBackward0]
5273075776 -> 5273077120
5273075776 [label=AddmmBackward0]
5273077024 -> 5273075776
5272811296 [label="transformer.layers.3.linear1.bias
(2048)" fillcolor=lightblue]
5272811296 -> 5273077024
5273077024 [label=AccumulateGrad]
5273077168 -> 5273075776
5273077168 [label=ViewBackward0]
5273049984 -> 5273077168
5273076592 -> 5273075776
5273076592 [label=TBackward0]
5273077312 -> 5273076592
5272811216 [label="transformer.layers.3.linear1.weight
(2048, 32)" fillcolor=lightblue]
5272811216 -> 5273077312
5273077312 [label=AccumulateGrad]
5273063680 -> 5273048976
5273063680 [label=TBackward0]
5273065072 -> 5273063680
5272811536 [label="transformer.layers.3.linear2.weight
(32, 2048)" fillcolor=lightblue]
5272811536 -> 5273065072
5273065072 [label=AccumulateGrad]
5273050224 -> 5273050608
5272811856 [label="transformer.layers.3.norm2.weight
(32)" fillcolor=lightblue]
5272811856 -> 5273050224
5273050224 [label=AccumulateGrad]
5273050176 -> 5273050608
5272811936 [label="transformer.layers.3.norm2.bias
(32)" fillcolor=lightblue]
5272811936 -> 5273050176
5273050176 [label=AccumulateGrad]
5273050560 -> 5273050416
5099314048 [label="norm.weight
(32)" fillcolor=lightblue]
5099314048 -> 5273050560
5273050560 [label=AccumulateGrad]
5273050512 -> 5273050416
5099989888 [label="norm.bias
(32)" fillcolor=lightblue]
5099989888 -> 5273050512
5273050512 [label=AccumulateGrad]
5273051040 -> 5273047872
5273051040 [label=TBackward0]
5273049840 -> 5273051040
5099945552 [label="fc1.weight
(128, 3200)" fillcolor=lightblue]
5099945552 -> 5273049840
5273049840 [label=AccumulateGrad]
5273048496 -> 5273050992
5099148688 [label="bn1.weight
(128)" fillcolor=lightblue]
5099148688 -> 5273048496
5273048496 [label=AccumulateGrad]
5273049120 -> 5273050992
5272651472 [label="bn1.bias
(128)" fillcolor=lightblue]
5272651472 -> 5273049120
5273049120 [label=AccumulateGrad]
5273047776 -> 5273047584
5273047776 [label=TBackward0]
5273050848 -> 5273047776
5272812096 [label="fc2.weight
(64, 128)" fillcolor=lightblue]
5272812096 -> 5273050848
5273050848 [label=AccumulateGrad]
5273048448 -> 5273048880
5272812256 [label="bn2.weight
(64)" fillcolor=lightblue]
5272812256 -> 5273048448
5273048448 [label=AccumulateGrad]
5273048400 -> 5273048880
5272812336 [label="bn2.bias
(64)" fillcolor=lightblue]
5272812336 -> 5273048400
5273048400 [label=AccumulateGrad]
5273049024 -> 5273049072
5273049024 [label=TBackward0]
5273047200 -> 5273049024
5272812656 [label="fc3.weight
(10, 64)" fillcolor=lightblue]
5272812656 -> 5273047200
5273047200 [label=AccumulateGrad]
5273049072 -> 5272679024
}