From 8b42a971c61da3a194e0a338c73f40835d27de91 Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Thu, 7 Nov 2024 20:34:50 -0800
Subject: [PATCH 1/2] Update compile3

---
 examples/openpilot/compile3.py | 40 ++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/examples/openpilot/compile3.py b/examples/openpilot/compile3.py
index 69331a6db6560..f330cc774e9ba 100644
--- a/examples/openpilot/compile3.py
+++ b/examples/openpilot/compile3.py
@@ -5,8 +5,8 @@
 if "NOLOCALS" not in os.environ: os.environ["NOLOCALS"] = "1"
 if "JIT_BATCH_SIZE" not in os.environ: os.environ["JIT_BATCH_SIZE"] = "0"
 
-from tinygrad import fetch, Tensor, TinyJit, Device, Context, GlobalCounters
-from tinygrad.helpers import OSX, DEBUG, getenv
+from tinygrad import fetch, Tensor, TinyJit, Context, GlobalCounters
+from tinygrad.helpers import DEBUG, getenv
 from tinygrad.tensor import _from_np_dtype
 
 import onnx
@@ -14,14 +14,14 @@
 from extra.onnx import get_run_onnx   # TODO: port to main tinygrad
 
 OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx"
-OUTPUT = "/tmp/openpilot.pkl"
+OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/openpilot.pkl"
 
 def compile():
   # hack to fix GPU on OSX: max doesn't work on half, see test/external/external_gpu_fail_osx.py
-  if OSX:
-    from tinygrad.ops import BinaryOps
-    from tinygrad.renderer.cstyle import ClangRenderer, CStyleLanguage
-    CStyleLanguage.code_for_op[BinaryOps.MAX] = ClangRenderer.code_for_op[BinaryOps.MAX]
+  #if OSX:
+  #  from tinygrad.ops import BinaryOps
+  #  from tinygrad.renderer.cstyle import ClangRenderer, CStyleLanguage
+  #  CStyleLanguage.code_for_op[BinaryOps.MAX] = ClangRenderer.code_for_op[BinaryOps.MAX]
 
   Tensor.no_grad = True
   Tensor.training = False
@@ -32,12 +32,30 @@ def compile():
   print("loaded model")
 
   input_shapes = {inp.name:tuple(x.dim_value for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input}
-  input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in onnx_model.graph.input}
+  input_types = {inp.name: np.float32 for inp in onnx_model.graph.input}
+  if 'input_img' in input_shapes:
+    input_shapes['input_img'] = (1, 1812, 1928)
+    input_types['input_img'] = np.uint8
+  else:
+    input_types['input_imgs'] = np.uint8
+    input_types['big_input_imgs'] = np.uint8
   Tensor.manual_seed(100)
   new_inputs = {k:Tensor.randn(*shp, dtype=_from_np_dtype(input_types[k])).mul(8).realize() for k,shp in sorted(input_shapes.items())}
   print("created tensors")
+  
+  # TODO remove this hack from dm
+  if 'input_img' in input_shapes: #DM model
+    def fun_to_jit(kwargs):
+      MODEL_WIDTH = 1440
+      MODEL_HEIGHT = 960
+      v_offset = kwargs['input_img'].shape[1] * 2 // 3 - MODEL_HEIGHT
+      h_offset = (kwargs['input_img'].shape[2] - MODEL_WIDTH) // 2
+      kwargs['input_img'] = kwargs['input_img'][:,v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH].reshape((1,-1))
+      return run_onnx(kwargs)
+  else:
+    fun_to_jit = run_onnx
 
-  run_onnx_jit = TinyJit(lambda **kwargs: run_onnx(kwargs), prune=True)
+  run_onnx_jit = TinyJit(lambda **kwargs: fun_to_jit(kwargs), prune=True)
   for i in range(3):
     GlobalCounters.reset()
     print(f"run {i}")
@@ -65,7 +83,11 @@ def test(test_val=None):
   new_inputs = {nm:Tensor.randn(*st.shape, dtype=dtype).mul(8).realize() for nm, (st, _, dtype, _) in
                 sorted(zip(run.captured.expected_names, run.captured.expected_st_vars_dtype_device))}
   for _ in range(20):
+    inputs_numpy = {k:v.numpy() for k,v in new_inputs.items()}
     st = time.perf_counter()
+    for k in new_inputs:
+      if 'img' not in k: # don't need to init img tensors, those are backed by OpenCL GPU memory
+        new_inputs[k] = Tensor(inputs_numpy[k])
     out = run(**new_inputs)
     mt = time.perf_counter()
     val = out['outputs'].numpy()

From 3a11fa9d7150d0ff85e9db3c8c4f1c9e31bca22f Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Fri, 8 Nov 2024 09:56:23 -0800
Subject: [PATCH 2/2] Revert hax

---
 examples/openpilot/compile3.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/examples/openpilot/compile3.py b/examples/openpilot/compile3.py
index f330cc774e9ba..c781d7cb0be4f 100644
--- a/examples/openpilot/compile3.py
+++ b/examples/openpilot/compile3.py
@@ -34,7 +34,6 @@ def compile():
   input_shapes = {inp.name:tuple(x.dim_value for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input}
   input_types = {inp.name: np.float32 for inp in onnx_model.graph.input}
   if 'input_img' in input_shapes:
-    input_shapes['input_img'] = (1, 1812, 1928)
     input_types['input_img'] = np.uint8
   else:
     input_types['input_imgs'] = np.uint8
@@ -42,20 +41,8 @@ def compile():
   Tensor.manual_seed(100)
   new_inputs = {k:Tensor.randn(*shp, dtype=_from_np_dtype(input_types[k])).mul(8).realize() for k,shp in sorted(input_shapes.items())}
   print("created tensors")
-  
-  # TODO remove this hack from dm
-  if 'input_img' in input_shapes: #DM model
-    def fun_to_jit(kwargs):
-      MODEL_WIDTH = 1440
-      MODEL_HEIGHT = 960
-      v_offset = kwargs['input_img'].shape[1] * 2 // 3 - MODEL_HEIGHT
-      h_offset = (kwargs['input_img'].shape[2] - MODEL_WIDTH) // 2
-      kwargs['input_img'] = kwargs['input_img'][:,v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH].reshape((1,-1))
-      return run_onnx(kwargs)
-  else:
-    fun_to_jit = run_onnx
 
-  run_onnx_jit = TinyJit(lambda **kwargs: fun_to_jit(kwargs), prune=True)
+  run_onnx_jit = TinyJit(lambda **kwargs: run_onnx(kwargs), prune=True)
   for i in range(3):
     GlobalCounters.reset()
     print(f"run {i}")