Skip to content

Commit c4dc7e6

Browse files
committed
Arm backend: Handle +FP+INT for vgf and quantized IO
Change-Id: Ie943e1de816d981c0f09d9bd3683881c03e3000c Signed-off-by: Per Åstrand <per.astrand@arm.com>
1 parent d20ad34 commit c4dc7e6

6 files changed

Lines changed: 125 additions & 16 deletions

File tree

backends/arm/_passes/insert_rescales_pass.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ def _ensure_uint8_io_only(self, graph_module: GraphModule) -> None:
5454
if node.op == "call_function" and node.target == operator.getitem:
5555
if all(user.op == "output" for user in node.users):
5656
continue
57+
if (
58+
node.op == "call_function"
59+
and node.target
60+
== exir_ops.edge.dim_order_ops._to_dim_order_copy.default
61+
):
62+
# dim_order is a view-like transform; allow it to preserve uint8 at IO.
63+
continue
5764
if (
5865
node.op == "call_function"
5966
and node.target == exir_ops.backend.tosa.RESCALE.default

backends/arm/runtime/EthosUBackend.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,19 +194,23 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
194194
bool supported = 0;
195195
// 32 bit int (simple non-quantised test cases)
196196
supported |=
197-
(tensor_in.scalar_type() == ScalarType::Int and
197+
(tensor_in.scalar_type() == ScalarType::Int &&
198198
handles.inputs->io[i].elem_size == 4);
199199
// 8 bit int (IOQDQ pass prepared networks)
200200
supported |=
201-
(tensor_in.scalar_type() == ScalarType::Char and
201+
(tensor_in.scalar_type() == ScalarType::Char &&
202+
handles.inputs->io[i].elem_size == 1);
203+
// 8 bit uint8 (IOQDQ pass prepared networks)
204+
supported |=
205+
(tensor_in.scalar_type() == ScalarType::Byte &&
202206
handles.inputs->io[i].elem_size == 1);
203207
// 16 bit int (IOQDQ pass prepared networks)
204208
supported |=
205-
(tensor_in.scalar_type() == ScalarType::Short and
209+
(tensor_in.scalar_type() == ScalarType::Short &&
206210
handles.inputs->io[i].elem_size == 2);
207211
// bool (IOQDQ pass prepared networks)
208212
supported |=
209-
(tensor_in.scalar_type() == ScalarType::Bool and
213+
(tensor_in.scalar_type() == ScalarType::Bool &&
210214
handles.inputs->io[i].elem_size == 1);
211215
if (!supported) {
212216
ET_LOG(
@@ -222,7 +226,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
222226
// which require permutation.
223227
bool both_int = tensor_in.scalar_type() == ScalarType::Int &&
224228
handles.inputs->io[i].elem_size == 4;
225-
bool both_char = tensor_in.scalar_type() == ScalarType::Char &&
229+
bool both_char = (tensor_in.scalar_type() == ScalarType::Char ||
230+
tensor_in.scalar_type() == ScalarType::Byte) &&
226231
handles.inputs->io[i].elem_size == 1;
227232
bool both_short = tensor_in.scalar_type() == ScalarType::Short &&
228233
handles.inputs->io[i].elem_size == 2;

backends/arm/test/common.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ def get_vgf_compile_spec(
160160
compiler_flags: Optional[str] = "",
161161
custom_path: Optional[str] = None,
162162
tosa_debug_mode: VgfCompileSpec.DebugMode | None = None,
163+
preserve_io_quantization: bool = False,
163164
) -> VgfCompileSpec:
164165
"""Get the ArmCompileSpec for the default VGF tests, to modify the compile
165166
spec before calling .build() to finalize it.
@@ -188,6 +189,9 @@ def get_vgf_compile_spec(
188189
.dump_debug_info(tosa_debug_mode)
189190
)
190191

192+
if preserve_io_quantization:
193+
compile_spec._set_preserve_io_quantization(True)
194+
191195
return compile_spec
192196

193197

backends/arm/test/passes/test_ioquantization_pass.py

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from executorch.backends.arm.test.tester.test_pipeline import (
2424
EthosU55PipelineINT,
2525
TosaPipelineINT,
26+
VgfPipeline,
2627
)
2728
from executorch.backends.arm.tosa.specification import (
2829
TosaLoweringContext,
@@ -402,6 +403,62 @@ def test_quantize_io_tosa_INT_uint8_numeric():
402403
)
403404
pipeline.quantizer.set_io(get_uint8_io_quantization_config())
404405

406+
_run_uint8_io_numeric_pipeline(pipeline, model, calib_input, calib_other)
407+
408+
409+
def test_quantize_io_vgf_INT_uint8_numeric():
410+
"""Run VGF flow with uint8 input and verify numerical output."""
411+
412+
model = SimpleModel().eval()
413+
calib_input = torch.rand(1, 4)
414+
calib_other = torch.rand(1, 4)
415+
416+
pipeline = VgfPipeline(
417+
model,
418+
(calib_input, calib_other),
419+
aten_op=[],
420+
exir_op=[],
421+
run_on_vulkan_runtime=True,
422+
quantize=True,
423+
use_to_edge_transform_and_lower=True,
424+
preserve_io_quantization=True,
425+
)
426+
427+
pipeline.quantizer.set_io(get_uint8_io_quantization_config())
428+
429+
if pipeline.has_stage("check_not.exir_quant_nodes"):
430+
pipeline.pop_stage("check_not.exir_quant_nodes")
431+
_run_uint8_io_numeric_pipeline(pipeline, model, calib_input, calib_other)
432+
433+
434+
def test_quantize_io_u55_INT_uint8_numeric():
435+
"""Run Ethos-U55 flow with uint8 input and verify numerical output."""
436+
model = SimpleModel().eval()
437+
calib_input = torch.rand(1, 4)
438+
calib_other = torch.rand(1, 4)
439+
440+
if not (
441+
common.corstone300_installed()
442+
and common.arm_executor_runner_exists("corstone-300")
443+
):
444+
pytest.xfail("Did not find Corstone-300 FVP or executor_runner on path")
445+
446+
pipeline = EthosU55PipelineINT(
447+
model,
448+
(calib_input, calib_other),
449+
aten_ops=[],
450+
exir_ops=[],
451+
run_on_fvp=True,
452+
use_to_edge_transform_and_lower=True,
453+
)
454+
pipeline.quantizer.set_io(get_uint8_io_quantization_config())
455+
456+
_run_uint8_io_numeric_pipeline(pipeline, model, calib_input, calib_other)
457+
458+
459+
def _run_uint8_io_numeric_pipeline( # noqa: C901
460+
pipeline, model, calib_input, calib_other
461+
) -> None:
405462
qparams = {}
406463

407464
def _apply_uint8_io(ep):
@@ -483,7 +540,6 @@ def _dequantize(tensor, scale, zp, qmin, qmax, dtype):
483540
# Match TOSA's signless int8 representation of unsigned outputs.
484541
return ref_u8
485542

486-
pipeline.pop_stage("run_method_and_compare_outputs.original_model")
487543
# Insert quantization of inputs/outputs after lowering so we can run uint8 IO.
488544
pipeline.add_stage_after(
489545
"to_edge_transform_and_lower",
@@ -505,9 +561,14 @@ def _dequantize(tensor, scale, zp, qmin, qmax, dtype):
505561
)
506562

507563
# Run the pipeline to get the quantization parameters without the standard comparison step
508-
pipeline.pop_stage("run_method_and_compare_outputs")
564+
if pipeline.has_stage("run_method_and_compare_outputs"):
565+
pipeline.pop_stage("run_method_and_compare_outputs")
509566
pipeline.run()
510567

568+
assert qparams["in0_dtype"] == torch.uint8
569+
assert qparams["in1_dtype"] == torch.uint8
570+
assert qparams["out_dtype"] == torch.uint8
571+
511572
# Calculate the calib inputs and outputs uint8 values given the
512573
# calibrated quantization parameters, so we can run the reference with the same quantized inputs.
513574
input_tensor = torch.ops.quantized_decomposed.quantize_per_tensor(
@@ -527,24 +588,26 @@ def _dequantize(tensor, scale, zp, qmin, qmax, dtype):
527588
qparams["in1_dtype"],
528589
)
529590

530-
print(
531-
f"input_tensor: {input_tensor}, other_input: {other_input}, qparams: {qparams}"
532-
)
533-
534591
# Compare against a reference that dequantizes uint8 inputs, runs the float model,
535592
# and requantizes to match TOSA's signless int8 representation.
536593
def uint8_compare_callback(reference, output, _qparams):
537594
# Map signless int8 to uint8
538-
output = output.to(torch.uint8)
539-
diff = (output.to(torch.int16) - reference.to(torch.int16)).abs()
595+
output_u8 = output.to(torch.uint8)
596+
reference_u8 = reference.to(torch.uint8)
597+
diff = (output_u8.to(torch.int16) - reference_u8.to(torch.int16)).abs()
540598
if diff.max().item() > 1:
541599
raise AssertionError(
542600
"Output mismatch beyond 1 LSB after uint8 IO flow. "
543601
f"max abs diff={diff.max().item()}"
544602
)
545603

604+
compare_stage = (
605+
StageType.SERIALIZE
606+
if pipeline.has_stage("serialize")
607+
else StageType.TO_EXECUTORCH
608+
)
546609
pipeline.tester.run_method_and_compare_outputs(
547-
stage=StageType.TO_EXECUTORCH,
610+
stage=compare_stage,
548611
inputs=(input_tensor, other_input),
549612
qtol=1,
550613
reference_stage_type=StageType.RUN_PASSES,

backends/arm/test/tester/test_pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,6 +1184,7 @@ def __init__(
11841184
tosa_extensions: Optional[List[str]] = None,
11851185
tosa_spec: TosaSpecification | str | None = None,
11861186
fold_quantize: bool = True,
1187+
preserve_io_quantization: bool = False,
11871188
):
11881189
if tosa_spec is None:
11891190
if tosa_version is None:
@@ -1201,6 +1202,7 @@ def __init__(
12011202
compiler_flags=vgf_compiler_flags,
12021203
custom_path=custom_path,
12031204
tosa_debug_mode=tosa_debug_mode,
1205+
preserve_io_quantization=preserve_io_quantization,
12041206
)
12051207

12061208
super().__init__(

backends/arm/tosa/partitioner.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,11 @@ def __init__(
152152
self.additional_checks = additional_checks
153153

154154
def _detag_boundary_nodes(
155-
self, module: GraphModule, tag: str, reporter: WhyNoPartitionReporter
155+
self,
156+
module: GraphModule,
157+
tag: str,
158+
reporter: WhyNoPartitionReporter,
159+
detag_first_fp_node: bool = True,
156160
) -> None:
157161
"""De-tag nodes at the partition boundary.
158162
@@ -188,7 +192,7 @@ def _detag_boundary_nodes(
188192
# Remove tag from quantize node with input outside partition,
189193
# or dequantize node with any output outside partition
190194
del node.meta["delegation_tag"]
191-
elif not is_q_node and not is_dq_node:
195+
elif detag_first_fp_node and not is_q_node and not is_dq_node:
192196
# For non Q/DQ nodes, remove tag from first node in partition if any input has fp dtype
193197
for input in node.all_input_nodes:
194198
if is_partitioned(input, tag):
@@ -201,6 +205,21 @@ def _detag_boundary_nodes(
201205
del node.meta["delegation_tag"]
202206
break
203207

208+
def _preserve_io_quantization_enabled(self) -> bool:
209+
"""Return True if IO quantization should be preserved from compile
210+
specs.
211+
"""
212+
for spec in self.delegation_spec.compile_specs:
213+
if spec.key != "preserve_io_quantization":
214+
continue
215+
raw = (
216+
spec.value.decode()
217+
if isinstance(spec.value, (bytes, bytearray))
218+
else str(spec.value)
219+
)
220+
return raw.lower() in ("1", "true", "yes")
221+
return False
222+
204223
def _partition_has_invalid_uint8(self, partition: Partition, tag: str) -> bool:
205224
"""Return True if any uint8 appears outside allowed IO nodes.
206225
@@ -295,6 +314,15 @@ def _tag_module( # noqa
295314
reporter,
296315
)
297316

317+
if self._preserve_io_quantization_enabled():
318+
# Detag boundary Q/DQ to keep IO quantization outside delegate.
319+
self._detag_boundary_nodes(
320+
module,
321+
tag,
322+
reporter,
323+
detag_first_fp_node=False,
324+
)
325+
298326
if self._partition_has_invalid_uint8(partition, tag):
299327
reject_partition(
300328
"Partition contained internal uint8 tensors. Uint8 is only supported at IO boundaries for TOSA backends.",

0 commit comments

Comments (0)